2 * summarysharedcommand.cpp
5 * Created by Sarah Westcott on 1/2/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "summarysharedcommand.h"
11 #include "sharedsobscollectsummary.h"
12 #include "sharedchao1.h"
13 #include "sharedace.h"
14 #include "sharednseqs.h"
15 #include "sharedjabund.h"
16 #include "sharedsorabund.h"
17 #include "sharedjclass.h"
18 #include "sharedsorclass.h"
19 #include "sharedjest.h"
20 #include "sharedsorest.h"
21 #include "sharedthetayc.h"
22 #include "sharedthetan.h"
23 #include "sharedkstest.h"
24 #include "whittaker.h"
25 #include "sharedochiai.h"
26 #include "sharedanderbergs.h"
27 #include "sharedkulczynski.h"
28 #include "sharedkulczynskicody.h"
29 #include "sharedlennon.h"
30 #include "sharedmorisitahorn.h"
31 #include "sharedbraycurtis.h"
32 #include "sharedjackknife.h"
33 #include "whittaker.h"
35 //**********************************************************************************************************************
36 vector<string> SummarySharedCommand::getValidParameters(){
38 string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"};
39 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43 m->errorOut(e, "SummarySharedCommand", "getValidParameters");
47 //**********************************************************************************************************************
48 SummarySharedCommand::SummarySharedCommand(){
51 //initialize outputTypes
52 vector<string> tempOutNames;
53 outputTypes["summary"] = tempOutNames;
56 m->errorOut(e, "SummarySharedCommand", "SummarySharedCommand");
60 //**********************************************************************************************************************
61 vector<string> SummarySharedCommand::getRequiredParameters(){
63 vector<string> myArray;
67 m->errorOut(e, "SummarySharedCommand", "getRequiredParameters");
71 //**********************************************************************************************************************
72 vector<string> SummarySharedCommand::getRequiredFiles(){
74 string Array[] = {"shared"};
75 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
79 m->errorOut(e, "SummarySharedCommand", "getRequiredFiles");
83 //**********************************************************************************************************************
85 SummarySharedCommand::SummarySharedCommand(string option) {
87 globaldata = GlobalData::getInstance();
93 //allow user to run help
94 if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
97 //valid paramters for this command
98 string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"};
99 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
101 OptionParser parser(option);
102 map<string, string> parameters = parser.getParameters();
104 ValidParameters validParameter;
106 //check to make sure all parameters are valid for command
107 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
108 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
111 //make sure the user has already run the read.otu command
112 if (globaldata->getSharedFile() == "") {
113 m->mothurOut("You must read a list and a group, or a shared before you can use the summary.shared command."); m->mothurOutEndLine(); abort = true;
116 //initialize outputTypes
117 vector<string> tempOutNames;
118 outputTypes["summary"] = tempOutNames;
120 //if the user changes the output directory command factory will send this info to us in the output parameter
121 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
123 outputDir += m->hasPath(globaldata->getSharedFile()); //if user entered a file with a path then preserve it
126 //check for optional parameter and set defaults
127 // ...at some point should added some additional type checking...
128 label = validParameter.validFile(parameters, "label", false);
129 if (label == "not found") { label = ""; }
131 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
132 else { allLines = 1; }
135 //if the user has not specified any labels use the ones from read.otu
137 allLines = globaldata->allLines;
138 labels = globaldata->labels;
141 calc = validParameter.validFile(parameters, "calc", false);
142 if (calc == "not found") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
144 if (calc == "default") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
146 m->splitAtDash(calc, Estimators);
148 groups = validParameter.validFile(parameters, "groups", false);
149 if (groups == "not found") { groups = ""; }
151 m->splitAtDash(groups, Groups);
152 globaldata->Groups = Groups;
155 string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; }
156 all = m->isTrue(temp);
158 temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; }
159 createPhylip = m->isTrue(temp);
161 temp = validParameter.validFile(parameters, "processors", false); if(temp == "not found"){ temp = "1"; }
162 convert(temp, processors);
164 if (abort == false) {
166 validCalculator = new ValidCalculators();
169 for (i=0; i<Estimators.size(); i++) {
170 if (validCalculator->isValidCalculator("sharedsummary", Estimators[i]) == true) {
171 if (Estimators[i] == "sharedsobs") {
172 sumCalculators.push_back(new SharedSobsCS());
173 }else if (Estimators[i] == "sharedchao") {
174 sumCalculators.push_back(new SharedChao1());
175 }else if (Estimators[i] == "sharedace") {
176 sumCalculators.push_back(new SharedAce());
177 }else if (Estimators[i] == "jabund") {
178 sumCalculators.push_back(new JAbund());
179 }else if (Estimators[i] == "sorabund") {
180 sumCalculators.push_back(new SorAbund());
181 }else if (Estimators[i] == "jclass") {
182 sumCalculators.push_back(new Jclass());
183 }else if (Estimators[i] == "sorclass") {
184 sumCalculators.push_back(new SorClass());
185 }else if (Estimators[i] == "jest") {
186 sumCalculators.push_back(new Jest());
187 }else if (Estimators[i] == "sorest") {
188 sumCalculators.push_back(new SorEst());
189 }else if (Estimators[i] == "thetayc") {
190 sumCalculators.push_back(new ThetaYC());
191 }else if (Estimators[i] == "thetan") {
192 sumCalculators.push_back(new ThetaN());
193 }else if (Estimators[i] == "kstest") {
194 sumCalculators.push_back(new KSTest());
195 }else if (Estimators[i] == "sharednseqs") {
196 sumCalculators.push_back(new SharedNSeqs());
197 }else if (Estimators[i] == "ochiai") {
198 sumCalculators.push_back(new Ochiai());
199 }else if (Estimators[i] == "anderberg") {
200 sumCalculators.push_back(new Anderberg());
201 }else if (Estimators[i] == "kulczynski") {
202 sumCalculators.push_back(new Kulczynski());
203 }else if (Estimators[i] == "kulczynskicody") {
204 sumCalculators.push_back(new KulczynskiCody());
205 }else if (Estimators[i] == "lennon") {
206 sumCalculators.push_back(new Lennon());
207 }else if (Estimators[i] == "morisitahorn") {
208 sumCalculators.push_back(new MorHorn());
209 }else if (Estimators[i] == "braycurtis") {
210 sumCalculators.push_back(new BrayCurtis());
211 }else if (Estimators[i] == "whittaker") {
212 sumCalculators.push_back(new Whittaker());
221 catch(exception& e) {
222 m->errorOut(e, "SummarySharedCommand", "SummarySharedCommand");
227 //**********************************************************************************************************************
229 void SummarySharedCommand::help(){
231 m->mothurOut("The summary.shared command can only be executed after a successful read.otu command.\n");
232 m->mothurOut("The summary.shared command parameters are label, calc and all. No parameters are required.\n");
233 m->mothurOut("The summary.shared command should be in the following format: \n");
234 m->mothurOut("summary.shared(label=yourLabel, calc=yourEstimators, groups=yourGroups).\n");
235 m->mothurOut("Example summary.shared(label=unique-.01-.03, groups=B-C, calc=sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan).\n");
236 validCalculator->printCalc("sharedsummary", cout);
237 m->mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n");
238 m->mothurOut("The default value for groups is all the groups in your groupfile.\n");
239 m->mothurOut("The label parameter is used to analyze specific labels in your input.\n");
240 m->mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is false.\n");
241 m->mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
242 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
243 m->mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
245 catch(exception& e) {
246 m->errorOut(e, "SummarySharedCommand", "help");
251 //**********************************************************************************************************************
253 SummarySharedCommand::~SummarySharedCommand(){
254 if (abort == false) {
256 delete validCalculator;
260 //**********************************************************************************************************************
262 int SummarySharedCommand::execute(){
265 if (abort == true) { return 0; }
267 ofstream outputFileHandle, outAll;
268 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "shared.summary";
270 //if the users entered no valid calculators don't execute command
271 if (sumCalculators.size() == 0) { return 0; }
272 //check if any calcs can do multiples
275 for (int i = 0; i < sumCalculators.size(); i++) {
276 if (sumCalculators[i]->getMultiple() == true) { mult = true; }
282 read = new ReadOTUFile(globaldata->inputFileName);
283 read->read(&*globaldata);
285 input = globaldata->ginput;
286 lookup = input->getSharedRAbundVectors();
287 string lastLabel = lookup[0]->getLabel();
289 /******************************************************/
290 //output headings for files
291 /******************************************************/
292 //output estimator names as column headers
293 m->openOutputFile(outputFileName, outputFileHandle);
294 outputFileHandle << "label" <<'\t' << "comparison" << '\t';
295 for(int i=0;i<sumCalculators.size();i++){
296 outputFileHandle << '\t' << sumCalculators[i]->getName();
297 if (sumCalculators[i]->getCols() == 3) { outputFileHandle << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; }
299 outputFileHandle << endl;
300 outputFileHandle.close();
302 //create file and put column headers for multiple groups file
303 string outAllFileName = ((m->getRootName(globaldata->inputFileName)) + "sharedmultiple.summary");
305 m->openOutputFile(outAllFileName, outAll);
306 outputNames.push_back(outAllFileName);
308 outAll << "label" <<'\t' << "comparison" << '\t';
309 for(int i=0;i<sumCalculators.size();i++){
310 if (sumCalculators[i]->getMultiple() == true) {
311 outAll << '\t' << sumCalculators[i]->getName();
318 if (lookup.size() < 2) {
319 m->mothurOut("I cannot run the command without at least 2 valid groups.");
320 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
322 //close files and clean up
323 remove(outputFileName.c_str());
324 if (mult == true) { remove(outAllFileName.c_str()); }
326 //if you only have 2 groups you don't need a .sharedmultiple file
327 }else if ((lookup.size() == 2) && (mult == true)) {
329 remove(outAllFileName.c_str());
330 outputNames.pop_back();
333 if (m->control_pressed) {
334 if (mult) { remove(outAllFileName.c_str()); }
335 remove(outputFileName.c_str());
336 delete input; globaldata->ginput = NULL;
337 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
338 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
339 globaldata->Groups.clear();
342 /******************************************************/
345 /******************************************************/
346 //comparison breakup to be used by different processes later
347 numGroups = globaldata->Groups.size();
348 lines.resize(processors);
349 for (int i = 0; i < processors; i++) {
350 lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
351 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups);
353 /******************************************************/
355 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
356 set<string> processedLabels;
357 set<string> userLabels = labels;
359 //as long as you are not at the end of the file or done wih the lines you want
360 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
361 if (m->control_pressed) {
362 if (mult) { remove(outAllFileName.c_str()); }
363 remove(outputFileName.c_str());
364 delete input; globaldata->ginput = NULL;
365 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
366 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
367 globaldata->Groups.clear();
372 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
373 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
374 process(lookup, outputFileName, outAllFileName);
376 processedLabels.insert(lookup[0]->getLabel());
377 userLabels.erase(lookup[0]->getLabel());
380 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
381 string saveLabel = lookup[0]->getLabel();
383 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
384 lookup = input->getSharedRAbundVectors(lastLabel);
386 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
387 process(lookup, outputFileName, outAllFileName);
389 processedLabels.insert(lookup[0]->getLabel());
390 userLabels.erase(lookup[0]->getLabel());
392 //restore real lastlabel to save below
393 lookup[0]->setLabel(saveLabel);
396 lastLabel = lookup[0]->getLabel();
398 //get next line to process
399 //prevent memory leak
400 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
401 lookup = input->getSharedRAbundVectors();
404 if (m->control_pressed) {
405 if (mult) { remove(outAllFileName.c_str()); }
406 remove(outputFileName.c_str());
407 delete input; globaldata->ginput = NULL;
408 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
409 globaldata->Groups.clear();
413 //output error messages about any remaining user labels
414 set<string>::iterator it;
415 bool needToRun = false;
416 for (it = userLabels.begin(); it != userLabels.end(); it++) {
417 m->mothurOut("Your file does not include the label " + *it);
418 if (processedLabels.count(lastLabel) != 1) {
419 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
422 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
426 //run last label if you need to
427 if (needToRun == true) {
428 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
429 lookup = input->getSharedRAbundVectors(lastLabel);
431 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
432 process(lookup, outputFileName, outAllFileName);
433 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
437 //reset groups parameter
438 globaldata->Groups.clear();
440 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
441 delete input; globaldata->ginput = NULL;
443 if (m->control_pressed) {
444 remove(outAllFileName.c_str());
445 remove(outputFileName.c_str());
449 m->mothurOutEndLine();
450 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
451 m->mothurOut(outputFileName); m->mothurOutEndLine();
452 if (mult) { m->mothurOut(outAllFileName); m->mothurOutEndLine(); outputTypes["summary"].push_back(outAllFileName); }
453 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } outputTypes["summary"].push_back(outputFileName);
454 m->mothurOutEndLine();
458 catch(exception& e) {
459 m->errorOut(e, "SummarySharedCommand", "execute");
464 /***********************************************************/
465 int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string sumFileName, string sumAllFileName) {
467 vector< vector<seqDist> > calcDists; //vector containing vectors that contains the summary results for each group compare
468 calcDists.resize(sumCalculators.size()); //one for each calc, this will be used to make .dist files
470 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
472 driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp", calcDists);
473 m->appendFiles((sumFileName + ".temp"), sumFileName);
474 remove((sumFileName + ".temp").c_str());
476 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
477 remove((sumAllFileName + ".temp").c_str());
481 vector<int> processIDS;
483 //loop through and create all the processes you want
484 while (process != processors) {
488 processIDS.push_back(pid);
491 driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
493 //only do this if you want a distance file
495 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(getpid()) + ".dist";
497 m->openOutputFile(tempdistFileName, outtemp);
499 for (int i = 0; i < calcDists.size(); i++) {
500 outtemp << calcDists[i].size() << endl;
502 for (int j = 0; j < calcDists[i].size(); j++) {
503 outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
510 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
513 //parent do your part
514 driver(thisLookup, lines[0].start, lines[0].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
515 m->appendFiles((sumFileName + toString(getpid()) + ".temp"), sumFileName);
516 remove((sumFileName + toString(getpid()) + ".temp").c_str());
517 if (mult) { m->appendFiles((sumAllFileName + toString(getpid()) + ".temp"), sumAllFileName); }
519 //force parent to wait until all the processes are done
520 for (int i = 0; i < processIDS.size(); i++) {
521 int temp = processIDS[i];
525 for (int i = 0; i < processIDS.size(); i++) {
526 m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
527 remove((sumFileName + toString(processIDS[i]) + ".temp").c_str());
528 if (mult) { remove((sumAllFileName + toString(processIDS[i]) + ".temp").c_str()); }
531 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(processIDS[i]) + ".dist";
533 m->openInputFile(tempdistFileName, intemp);
535 for (int i = 0; i < calcDists.size(); i++) {
537 intemp >> size; m->gobble(intemp);
539 for (int j = 0; j < size; j++) {
544 intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
546 seqDist tempDist(seq1, seq2, dist);
547 calcDists[i].push_back(tempDist);
551 remove(tempdistFileName.c_str());
557 driver(thisLookup, 0, numGroups, (sumFileName + ".temp"), (sumAllFileName + ".temp"), calcDists);
558 m->appendFiles((sumFileName + ".temp"), sumFileName);
559 remove((sumFileName + ".temp").c_str());
561 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
562 remove((sumAllFileName + ".temp").c_str());
567 for (int i = 0; i < calcDists.size(); i++) {
568 if (m->control_pressed) { break; }
570 string distFileName = outputDir + m->getRootName(m->getSimpleName(sumFileName)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist";
571 outputNames.push_back(distFileName);
573 m->openOutputFile(distFileName, outDist);
574 outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
577 vector< vector<float> > matrix; //square matrix to represent the distance
578 matrix.resize(thisLookup.size());
579 for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
582 for (int j = 0; j < calcDists[i].size(); j++) {
583 int row = calcDists[i][j].seq1;
584 int column = calcDists[i][j].seq2;
585 float dist = calcDists[i][j].dist;
587 matrix[row][column] = dist;
588 matrix[column][row] = dist;
592 outDist << thisLookup.size() << endl;
593 for (int r=0; r<thisLookup.size(); r++) {
595 string name = thisLookup[r]->getGroup();
596 if (name.length() < 10) { //pad with spaces to make compatible
597 while (name.length() < 10) { name += " "; }
599 outDist << name << '\t';
602 for (int l = 0; l < r; l++) { outDist << matrix[r][l] << '\t'; }
610 catch(exception& e) {
611 m->errorOut(e, "SummarySharedCommand", "process");
615 /**************************************************************************************************/
616 int SummarySharedCommand::driver(vector<SharedRAbundVector*> thisLookup, int start, int end, string sumFile, string sumAllFile, vector< vector<seqDist> >& calcDists) {
619 //loop through calculators and add to file all for all calcs that can do mutiple groups
622 m->openOutputFile(sumAllFile, outAll);
625 outAll << thisLookup[0]->getLabel() << '\t';
627 //output groups names
628 string outNames = "";
629 for (int j = 0; j < thisLookup.size(); j++) {
630 outNames += thisLookup[j]->getGroup() + "-";
632 outNames = outNames.substr(0, outNames.length()-1); //rip off extra '-';
633 outAll << outNames << '\t';
635 for(int i=0;i<sumCalculators.size();i++){
636 if (sumCalculators[i]->getMultiple() == true) {
637 sumCalculators[i]->getValues(thisLookup);
639 if (m->control_pressed) { outAll.close(); return 1; }
642 sumCalculators[i]->print(outAll);
649 ofstream outputFileHandle;
650 m->openOutputFile(sumFile, outputFileHandle);
652 vector<SharedRAbundVector*> subset;
653 for (int k = start; k < end; k++) { // pass cdd each set of groups to compare
655 for (int l = 0; l < k; l++) {
657 outputFileHandle << thisLookup[0]->getLabel() << '\t';
659 subset.clear(); //clear out old pair of sharedrabunds
660 //add new pair of sharedrabunds
661 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]);
663 //sort groups to be alphanumeric
664 if (thisLookup[k]->getGroup() > thisLookup[l]->getGroup()) {
665 outputFileHandle << (thisLookup[l]->getGroup() +'\t' + thisLookup[k]->getGroup()) << '\t'; //print out groups
667 outputFileHandle << (thisLookup[k]->getGroup() +'\t' + thisLookup[l]->getGroup()) << '\t'; //print out groups
670 for(int i=0;i<sumCalculators.size();i++) {
672 vector<double> tempdata = sumCalculators[i]->getValues(subset); //saves the calculator outputs
674 if (m->control_pressed) { outputFileHandle.close(); return 1; }
676 outputFileHandle << '\t';
677 sumCalculators[i]->print(outputFileHandle);
679 seqDist temp(l, k, tempdata[0]);
680 calcDists[i].push_back(temp);
682 outputFileHandle << endl;
686 outputFileHandle.close();
690 catch(exception& e) {
691 m->errorOut(e, "SummarySharedCommand", "driver");
695 /**************************************************************************************************/