2 * summarysharedcommand.cpp
5 * Created by Sarah Westcott on 1/2/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "summarysharedcommand.h"
11 #include "sharedsobscollectsummary.h"
12 #include "sharedchao1.h"
13 #include "sharedace.h"
14 #include "sharednseqs.h"
15 #include "sharedjabund.h"
16 #include "sharedsorabund.h"
17 #include "sharedjclass.h"
18 #include "sharedsorclass.h"
19 #include "sharedjest.h"
20 #include "sharedsorest.h"
21 #include "sharedthetayc.h"
22 #include "sharedthetan.h"
23 #include "sharedkstest.h"
24 #include "whittaker.h"
25 #include "sharedochiai.h"
26 #include "sharedanderbergs.h"
27 #include "sharedkulczynski.h"
28 #include "sharedkulczynskicody.h"
29 #include "sharedlennon.h"
30 #include "sharedmorisitahorn.h"
31 #include "sharedbraycurtis.h"
32 #include "sharedjackknife.h"
33 #include "whittaker.h"
35 //**********************************************************************************************************************
36 vector<string> SummarySharedCommand::getValidParameters(){
38 string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"};
39 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43 m->errorOut(e, "SummarySharedCommand", "getValidParameters");
47 //**********************************************************************************************************************
48 SummarySharedCommand::SummarySharedCommand(){
50 //initialize outputTypes
51 vector<string> tempOutNames;
52 outputTypes["summary"] = tempOutNames;
55 m->errorOut(e, "SummarySharedCommand", "SummarySharedCommand");
59 //**********************************************************************************************************************
60 vector<string> SummarySharedCommand::getRequiredParameters(){
62 vector<string> myArray;
66 m->errorOut(e, "SummarySharedCommand", "getRequiredParameters");
70 //**********************************************************************************************************************
71 vector<string> SummarySharedCommand::getRequiredFiles(){
73 string Array[] = {"shared"};
74 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
78 m->errorOut(e, "SummarySharedCommand", "getRequiredFiles");
82 //**********************************************************************************************************************
84 SummarySharedCommand::SummarySharedCommand(string option) {
86 globaldata = GlobalData::getInstance();
92 //allow user to run help
93 if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
96 //valid paramters for this command
97 string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"};
98 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
100 OptionParser parser(option);
101 map<string, string> parameters = parser.getParameters();
103 ValidParameters validParameter;
105 //check to make sure all parameters are valid for command
106 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
107 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
110 //make sure the user has already run the read.otu command
111 if (globaldata->getSharedFile() == "") {
112 m->mothurOut("You must read a list and a group, or a shared before you can use the summary.shared command."); m->mothurOutEndLine(); abort = true;
115 //initialize outputTypes
116 vector<string> tempOutNames;
117 outputTypes["summary"] = tempOutNames;
119 //if the user changes the output directory command factory will send this info to us in the output parameter
120 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
122 outputDir += m->hasPath(globaldata->getSharedFile()); //if user entered a file with a path then preserve it
125 //check for optional parameter and set defaults
126 // ...at some point should added some additional type checking...
127 label = validParameter.validFile(parameters, "label", false);
128 if (label == "not found") { label = ""; }
130 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
131 else { allLines = 1; }
134 //if the user has not specified any labels use the ones from read.otu
136 allLines = globaldata->allLines;
137 labels = globaldata->labels;
140 calc = validParameter.validFile(parameters, "calc", false);
141 if (calc == "not found") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
143 if (calc == "default") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
145 m->splitAtDash(calc, Estimators);
147 groups = validParameter.validFile(parameters, "groups", false);
148 if (groups == "not found") { groups = ""; }
150 m->splitAtDash(groups, Groups);
151 globaldata->Groups = Groups;
154 string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; }
155 all = m->isTrue(temp);
157 temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; }
158 createPhylip = m->isTrue(temp);
160 temp = validParameter.validFile(parameters, "processors", false); if(temp == "not found"){ temp = "1"; }
161 convert(temp, processors);
163 if (abort == false) {
165 validCalculator = new ValidCalculators();
168 for (i=0; i<Estimators.size(); i++) {
169 if (validCalculator->isValidCalculator("sharedsummary", Estimators[i]) == true) {
170 if (Estimators[i] == "sharedsobs") {
171 sumCalculators.push_back(new SharedSobsCS());
172 }else if (Estimators[i] == "sharedchao") {
173 sumCalculators.push_back(new SharedChao1());
174 }else if (Estimators[i] == "sharedace") {
175 sumCalculators.push_back(new SharedAce());
176 }else if (Estimators[i] == "jabund") {
177 sumCalculators.push_back(new JAbund());
178 }else if (Estimators[i] == "sorabund") {
179 sumCalculators.push_back(new SorAbund());
180 }else if (Estimators[i] == "jclass") {
181 sumCalculators.push_back(new Jclass());
182 }else if (Estimators[i] == "sorclass") {
183 sumCalculators.push_back(new SorClass());
184 }else if (Estimators[i] == "jest") {
185 sumCalculators.push_back(new Jest());
186 }else if (Estimators[i] == "sorest") {
187 sumCalculators.push_back(new SorEst());
188 }else if (Estimators[i] == "thetayc") {
189 sumCalculators.push_back(new ThetaYC());
190 }else if (Estimators[i] == "thetan") {
191 sumCalculators.push_back(new ThetaN());
192 }else if (Estimators[i] == "kstest") {
193 sumCalculators.push_back(new KSTest());
194 }else if (Estimators[i] == "sharednseqs") {
195 sumCalculators.push_back(new SharedNSeqs());
196 }else if (Estimators[i] == "ochiai") {
197 sumCalculators.push_back(new Ochiai());
198 }else if (Estimators[i] == "anderberg") {
199 sumCalculators.push_back(new Anderberg());
200 }else if (Estimators[i] == "kulczynski") {
201 sumCalculators.push_back(new Kulczynski());
202 }else if (Estimators[i] == "kulczynskicody") {
203 sumCalculators.push_back(new KulczynskiCody());
204 }else if (Estimators[i] == "lennon") {
205 sumCalculators.push_back(new Lennon());
206 }else if (Estimators[i] == "morisitahorn") {
207 sumCalculators.push_back(new MorHorn());
208 }else if (Estimators[i] == "braycurtis") {
209 sumCalculators.push_back(new BrayCurtis());
210 }else if (Estimators[i] == "whittaker") {
211 sumCalculators.push_back(new Whittaker());
220 catch(exception& e) {
221 m->errorOut(e, "SummarySharedCommand", "SummarySharedCommand");
226 //**********************************************************************************************************************
228 void SummarySharedCommand::help(){
230 m->mothurOut("The summary.shared command can only be executed after a successful read.otu command.\n");
231 m->mothurOut("The summary.shared command parameters are label, calc and all. No parameters are required.\n");
232 m->mothurOut("The summary.shared command should be in the following format: \n");
233 m->mothurOut("summary.shared(label=yourLabel, calc=yourEstimators, groups=yourGroups).\n");
234 m->mothurOut("Example summary.shared(label=unique-.01-.03, groups=B-C, calc=sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan).\n");
235 validCalculator->printCalc("sharedsummary", cout);
236 m->mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n");
237 m->mothurOut("The default value for groups is all the groups in your groupfile.\n");
238 m->mothurOut("The label parameter is used to analyze specific labels in your input.\n");
239 m->mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is false.\n");
240 m->mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
241 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
242 m->mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
244 catch(exception& e) {
245 m->errorOut(e, "SummarySharedCommand", "help");
250 //**********************************************************************************************************************
252 SummarySharedCommand::~SummarySharedCommand(){
253 if (abort == false) {
255 delete validCalculator;
259 //**********************************************************************************************************************
261 int SummarySharedCommand::execute(){
264 if (abort == true) { return 0; }
266 ofstream outputFileHandle, outAll;
267 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "shared.summary";
269 //if the users entered no valid calculators don't execute command
270 if (sumCalculators.size() == 0) { return 0; }
271 //check if any calcs can do multiples
274 for (int i = 0; i < sumCalculators.size(); i++) {
275 if (sumCalculators[i]->getMultiple() == true) { mult = true; }
281 read = new ReadOTUFile(globaldata->inputFileName);
282 read->read(&*globaldata);
284 input = globaldata->ginput;
285 lookup = input->getSharedRAbundVectors();
286 string lastLabel = lookup[0]->getLabel();
288 /******************************************************/
289 //output headings for files
290 /******************************************************/
291 //output estimator names as column headers
292 m->openOutputFile(outputFileName, outputFileHandle);
293 outputFileHandle << "label" <<'\t' << "comparison" << '\t';
294 for(int i=0;i<sumCalculators.size();i++){
295 outputFileHandle << '\t' << sumCalculators[i]->getName();
296 if (sumCalculators[i]->getCols() == 3) { outputFileHandle << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; }
298 outputFileHandle << endl;
299 outputFileHandle.close();
301 //create file and put column headers for multiple groups file
302 string outAllFileName = ((m->getRootName(globaldata->inputFileName)) + "sharedmultiple.summary");
304 m->openOutputFile(outAllFileName, outAll);
305 outputNames.push_back(outAllFileName);
307 outAll << "label" <<'\t' << "comparison" << '\t';
308 for(int i=0;i<sumCalculators.size();i++){
309 if (sumCalculators[i]->getMultiple() == true) {
310 outAll << '\t' << sumCalculators[i]->getName();
317 if (lookup.size() < 2) {
318 m->mothurOut("I cannot run the command without at least 2 valid groups.");
319 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
321 //close files and clean up
322 remove(outputFileName.c_str());
323 if (mult == true) { remove(outAllFileName.c_str()); }
325 //if you only have 2 groups you don't need a .sharedmultiple file
326 }else if ((lookup.size() == 2) && (mult == true)) {
328 remove(outAllFileName.c_str());
329 outputNames.pop_back();
332 if (m->control_pressed) {
333 if (mult) { remove(outAllFileName.c_str()); }
334 remove(outputFileName.c_str());
335 delete input; globaldata->ginput = NULL;
336 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
337 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
338 globaldata->Groups.clear();
341 /******************************************************/
344 /******************************************************/
345 //comparison breakup to be used by different processes later
346 numGroups = globaldata->Groups.size();
347 lines.resize(processors);
348 for (int i = 0; i < processors; i++) {
349 lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
350 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups);
352 /******************************************************/
354 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
355 set<string> processedLabels;
356 set<string> userLabels = labels;
358 //as long as you are not at the end of the file or done wih the lines you want
359 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
360 if (m->control_pressed) {
361 if (mult) { remove(outAllFileName.c_str()); }
362 remove(outputFileName.c_str());
363 delete input; globaldata->ginput = NULL;
364 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
365 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
366 globaldata->Groups.clear();
371 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
372 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
373 process(lookup, outputFileName, outAllFileName);
375 processedLabels.insert(lookup[0]->getLabel());
376 userLabels.erase(lookup[0]->getLabel());
379 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
380 string saveLabel = lookup[0]->getLabel();
382 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
383 lookup = input->getSharedRAbundVectors(lastLabel);
385 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
386 process(lookup, outputFileName, outAllFileName);
388 processedLabels.insert(lookup[0]->getLabel());
389 userLabels.erase(lookup[0]->getLabel());
391 //restore real lastlabel to save below
392 lookup[0]->setLabel(saveLabel);
395 lastLabel = lookup[0]->getLabel();
397 //get next line to process
398 //prevent memory leak
399 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
400 lookup = input->getSharedRAbundVectors();
403 if (m->control_pressed) {
404 if (mult) { remove(outAllFileName.c_str()); }
405 remove(outputFileName.c_str());
406 delete input; globaldata->ginput = NULL;
407 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
408 globaldata->Groups.clear();
412 //output error messages about any remaining user labels
413 set<string>::iterator it;
414 bool needToRun = false;
415 for (it = userLabels.begin(); it != userLabels.end(); it++) {
416 m->mothurOut("Your file does not include the label " + *it);
417 if (processedLabels.count(lastLabel) != 1) {
418 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
421 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
425 //run last label if you need to
426 if (needToRun == true) {
427 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
428 lookup = input->getSharedRAbundVectors(lastLabel);
430 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
431 process(lookup, outputFileName, outAllFileName);
432 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
436 //reset groups parameter
437 globaldata->Groups.clear();
439 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
440 delete input; globaldata->ginput = NULL;
442 if (m->control_pressed) {
443 remove(outAllFileName.c_str());
444 remove(outputFileName.c_str());
448 m->mothurOutEndLine();
449 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
450 m->mothurOut(outputFileName); m->mothurOutEndLine();
451 if (mult) { m->mothurOut(outAllFileName); m->mothurOutEndLine(); outputTypes["summary"].push_back(outAllFileName); }
452 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } outputTypes["summary"].push_back(outputFileName);
453 m->mothurOutEndLine();
457 catch(exception& e) {
458 m->errorOut(e, "SummarySharedCommand", "execute");
463 /***********************************************************/
464 int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string sumFileName, string sumAllFileName) {
466 vector< vector<seqDist> > calcDists; //vector containing vectors that contains the summary results for each group compare
467 calcDists.resize(sumCalculators.size()); //one for each calc, this will be used to make .dist files
469 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
471 driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp", calcDists);
472 m->appendFiles((sumFileName + ".temp"), sumFileName);
473 remove((sumFileName + ".temp").c_str());
475 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
476 remove((sumAllFileName + ".temp").c_str());
480 vector<int> processIDS;
482 //loop through and create all the processes you want
483 while (process != processors) {
487 processIDS.push_back(pid);
490 driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
492 //only do this if you want a distance file
494 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(getpid()) + ".dist";
496 m->openOutputFile(tempdistFileName, outtemp);
498 for (int i = 0; i < calcDists.size(); i++) {
499 outtemp << calcDists[i].size() << endl;
501 for (int j = 0; j < calcDists[i].size(); j++) {
502 outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
509 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
512 //parent do your part
513 driver(thisLookup, lines[0].start, lines[0].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
514 m->appendFiles((sumFileName + toString(getpid()) + ".temp"), sumFileName);
515 remove((sumFileName + toString(getpid()) + ".temp").c_str());
516 if (mult) { m->appendFiles((sumAllFileName + toString(getpid()) + ".temp"), sumAllFileName); }
518 //force parent to wait until all the processes are done
519 for (int i = 0; i < processIDS.size(); i++) {
520 int temp = processIDS[i];
524 for (int i = 0; i < processIDS.size(); i++) {
525 m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
526 remove((sumFileName + toString(processIDS[i]) + ".temp").c_str());
527 if (mult) { remove((sumAllFileName + toString(processIDS[i]) + ".temp").c_str()); }
530 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(processIDS[i]) + ".dist";
532 m->openInputFile(tempdistFileName, intemp);
534 for (int i = 0; i < calcDists.size(); i++) {
536 intemp >> size; m->gobble(intemp);
538 for (int j = 0; j < size; j++) {
543 intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
545 seqDist tempDist(seq1, seq2, dist);
546 calcDists[i].push_back(tempDist);
550 remove(tempdistFileName.c_str());
556 driver(thisLookup, 0, numGroups, (sumFileName + ".temp"), (sumAllFileName + ".temp"), calcDists);
557 m->appendFiles((sumFileName + ".temp"), sumFileName);
558 remove((sumFileName + ".temp").c_str());
560 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
561 remove((sumAllFileName + ".temp").c_str());
566 for (int i = 0; i < calcDists.size(); i++) {
567 if (m->control_pressed) { break; }
569 string distFileName = outputDir + m->getRootName(m->getSimpleName(sumFileName)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist";
570 outputNames.push_back(distFileName);
572 m->openOutputFile(distFileName, outDist);
573 outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
576 vector< vector<float> > matrix; //square matrix to represent the distance
577 matrix.resize(thisLookup.size());
578 for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
581 for (int j = 0; j < calcDists[i].size(); j++) {
582 int row = calcDists[i][j].seq1;
583 int column = calcDists[i][j].seq2;
584 float dist = calcDists[i][j].dist;
586 matrix[row][column] = dist;
587 matrix[column][row] = dist;
591 outDist << thisLookup.size() << endl;
592 for (int r=0; r<thisLookup.size(); r++) {
594 string name = thisLookup[r]->getGroup();
595 if (name.length() < 10) { //pad with spaces to make compatible
596 while (name.length() < 10) { name += " "; }
598 outDist << name << '\t';
601 for (int l = 0; l < r; l++) { outDist << matrix[r][l] << '\t'; }
609 catch(exception& e) {
610 m->errorOut(e, "SummarySharedCommand", "process");
614 /**************************************************************************************************/
615 int SummarySharedCommand::driver(vector<SharedRAbundVector*> thisLookup, int start, int end, string sumFile, string sumAllFile, vector< vector<seqDist> >& calcDists) {
618 //loop through calculators and add to file all for all calcs that can do mutiple groups
621 m->openOutputFile(sumAllFile, outAll);
624 outAll << thisLookup[0]->getLabel() << '\t';
626 //output groups names
627 string outNames = "";
628 for (int j = 0; j < thisLookup.size(); j++) {
629 outNames += thisLookup[j]->getGroup() + "-";
631 outNames = outNames.substr(0, outNames.length()-1); //rip off extra '-';
632 outAll << outNames << '\t';
634 for(int i=0;i<sumCalculators.size();i++){
635 if (sumCalculators[i]->getMultiple() == true) {
636 sumCalculators[i]->getValues(thisLookup);
638 if (m->control_pressed) { outAll.close(); return 1; }
641 sumCalculators[i]->print(outAll);
648 ofstream outputFileHandle;
649 m->openOutputFile(sumFile, outputFileHandle);
651 vector<SharedRAbundVector*> subset;
652 for (int k = start; k < end; k++) { // pass cdd each set of groups to compare
654 for (int l = 0; l < k; l++) {
656 outputFileHandle << thisLookup[0]->getLabel() << '\t';
658 subset.clear(); //clear out old pair of sharedrabunds
659 //add new pair of sharedrabunds
660 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]);
662 //sort groups to be alphanumeric
663 if (thisLookup[k]->getGroup() > thisLookup[l]->getGroup()) {
664 outputFileHandle << (thisLookup[l]->getGroup() +'\t' + thisLookup[k]->getGroup()) << '\t'; //print out groups
666 outputFileHandle << (thisLookup[k]->getGroup() +'\t' + thisLookup[l]->getGroup()) << '\t'; //print out groups
669 for(int i=0;i<sumCalculators.size();i++) {
671 vector<double> tempdata = sumCalculators[i]->getValues(subset); //saves the calculator outputs
673 if (m->control_pressed) { outputFileHandle.close(); return 1; }
675 outputFileHandle << '\t';
676 sumCalculators[i]->print(outputFileHandle);
678 seqDist temp(l, k, tempdata[0]);
679 calcDists[i].push_back(temp);
681 outputFileHandle << endl;
685 outputFileHandle.close();
689 catch(exception& e) {
690 m->errorOut(e, "SummarySharedCommand", "driver");
694 /**************************************************************************************************/