2 * normalizesharedcommand.cpp
5 * Created by westcott on 9/15/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "normalizesharedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> NormalizeSharedCommand::getValidParameters(){
15 string Array[] = {"groups","label","method","makerelabund","outputdir","inputdir","norm"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "NormalizeSharedCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 NormalizeSharedCommand::NormalizeSharedCommand(){
27 abort = true; calledHelp = true;
28 vector<string> tempOutNames;
29 outputTypes["shared"] = tempOutNames;
32 m->errorOut(e, "NormalizeSharedCommand", "NormalizeSharedCommand");
36 //**********************************************************************************************************************
37 vector<string> NormalizeSharedCommand::getRequiredParameters(){
39 vector<string> myArray;
43 m->errorOut(e, "NormalizeSharedCommand", "getRequiredParameters");
47 //**********************************************************************************************************************
48 vector<string> NormalizeSharedCommand::getRequiredFiles(){
50 string Array[] = {"shared"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "NormalizeSharedCommand", "getRequiredFiles");
59 //**********************************************************************************************************************
61 NormalizeSharedCommand::NormalizeSharedCommand(string option) {
63 globaldata = GlobalData::getInstance();
64 abort = false; calledHelp = false;
68 //allow user to run help
69 if(option == "help") { help(); abort = true; calledHelp = true; }
72 //valid paramters for this command
73 string AlignArray[] = {"groups","label","method","makerelabund","outputdir","inputdir","norm"};
74 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
76 OptionParser parser(option);
77 map<string,string> parameters = parser.getParameters();
79 ValidParameters validParameter;
81 //check to make sure all parameters are valid for command
82 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
83 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
86 //initialize outputTypes
87 vector<string> tempOutNames;
88 outputTypes["shared"] = tempOutNames;
90 //if the user changes the output directory command factory will send this info to us in the output parameter
91 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
93 outputDir += m->hasPath(globaldata->inputFileName); //if user entered a file with a path then preserve it
96 //make sure the user has already run the read.otu command
97 if ((globaldata->getSharedFile() == "") && (globaldata->getRelAbundFile() == "")) {
98 m->mothurOut("You must read a list and a group, shared or relabund file before you can use the normalize.shared command."); m->mothurOutEndLine(); abort = true;
101 if ((globaldata->getSharedFile() != "") && (globaldata->getRelAbundFile() != "")) {
102 m->mothurOut("You may not use both a shared and relabund file as input for normalize.shared command."); m->mothurOutEndLine(); abort = true;
106 //check for optional parameter and set defaults
107 // ...at some point should added some additional type checking...
108 label = validParameter.validFile(parameters, "label", false);
109 if (label == "not found") { label = ""; }
111 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
112 else { allLines = 1; }
115 //if the user has not specified any labels use the ones from read.otu
117 allLines = globaldata->allLines;
118 labels = globaldata->labels;
121 groups = validParameter.validFile(parameters, "groups", false);
122 if (groups == "not found") { groups = ""; pickedGroups = false; }
125 m->splitAtDash(groups, Groups);
126 globaldata->Groups = Groups;
129 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "totalgroup"; }
130 if ((method != "totalgroup") && (method != "zscore")) { m->mothurOut(method + " is not a valid scaling option for the normalize.shared command. The options are totalgroup and zscore. We hope to add more ways to normalize in the future, suggestions are welcome!"); m->mothurOutEndLine(); abort = true; }
132 string temp = validParameter.validFile(parameters, "norm", false);
133 if (temp == "not found") {
134 norm = 0; //once you have read, set norm to smallest group number
137 if (norm < 0) { m->mothurOut("norm must be positive."); m->mothurOutEndLine(); abort=true; }
140 temp = validParameter.validFile(parameters, "makerelabund", false); if (temp == "") { temp = "f"; }
141 makeRelabund = m->isTrue(temp);
143 if ((globaldata->getFormat() != "sharedfile") && makeRelabund) { m->mothurOut("makerelabund can only be used with a shared file."); m->mothurOutEndLine(); }
148 catch(exception& e) {
149 m->errorOut(e, "NormalizeSharedCommand", "NormalizeSharedCommand");
154 //**********************************************************************************************************************
156 void NormalizeSharedCommand::help(){
158 m->mothurOut("The normalize.shared command can only be executed after a successful read.otu command of a list and group, shared or relabund file.\n");
159 m->mothurOut("The normalize.shared command parameters are groups, method, norm, makerelabund and label. No parameters are required.\n");
160 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n");
161 m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n");
162 m->mothurOut("The method parameter allows you to select what method you would like to use to normalize. The options are totalgroup and zscore. We hope to add more ways to normalize in the future, suggestions are welcome!\n");
163 m->mothurOut("The makerelabund parameter allows you to convert a shared file to a relabund file before you normalize. default=f.\n");
164 m->mothurOut("The norm parameter allows you to number you would like to normalize to. By default this is set to the number of sequences in your smallest group.\n");
165 m->mothurOut("The normalize.shared command should be in the following format: normalize.shared(groups=yourGroups, label=yourLabels).\n");
166 m->mothurOut("Example normalize.shared(groups=A-B-C, scale=totalgroup).\n");
167 m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n");
168 m->mothurOut("The normalize.shared command outputs a .norm.shared file.\n");
169 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
172 catch(exception& e) {
173 m->errorOut(e, "NormalizeSharedCommand", "help");
178 //**********************************************************************************************************************
180 NormalizeSharedCommand::~NormalizeSharedCommand(){}
182 //**********************************************************************************************************************
184 int NormalizeSharedCommand::execute(){
187 if (abort == true) { if (calledHelp) { return 0; } return 2; }
189 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "norm.shared";
191 m->openOutputFile(outputFileName, out);
193 if (globaldata->getFormat() == "sharedfile") { input = new InputData(globaldata->inputFileName, "sharedfile"); }
194 else { input = new InputData(globaldata->inputFileName, "relabund"); }
196 //you are reading a sharedfile and you do not want to make relabund
197 if ((globaldata->getFormat() == "sharedfile") && (!makeRelabund)) {
198 lookup = input->getSharedRAbundVectors();
199 string lastLabel = lookup[0]->getLabel();
201 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
202 set<string> processedLabels;
203 set<string> userLabels = labels;
205 if (method == "totalgroup") {
206 //set norm to smallest group number
208 norm = lookup[0]->getNumSeqs();
209 for (int i = 1; i < lookup.size(); i++) {
210 if (lookup[i]->getNumSeqs() < norm) { norm = lookup[i]->getNumSeqs(); }
214 m->mothurOut("Normalizing to " + toString(norm) + "."); m->mothurOutEndLine();
217 //as long as you are not at the end of the file or done wih the lines you want
218 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
220 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
222 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
224 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
225 normalize(lookup, out);
227 processedLabels.insert(lookup[0]->getLabel());
228 userLabels.erase(lookup[0]->getLabel());
231 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
232 string saveLabel = lookup[0]->getLabel();
234 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
235 lookup = input->getSharedRAbundVectors(lastLabel);
236 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
238 normalize(lookup, out);
240 processedLabels.insert(lookup[0]->getLabel());
241 userLabels.erase(lookup[0]->getLabel());
243 //restore real lastlabel to save below
244 lookup[0]->setLabel(saveLabel);
247 lastLabel = lookup[0]->getLabel();
248 //prevent memory leak
249 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
251 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
253 //get next line to process
254 lookup = input->getSharedRAbundVectors();
257 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
259 //output error messages about any remaining user labels
260 set<string>::iterator it;
261 bool needToRun = false;
262 for (it = userLabels.begin(); it != userLabels.end(); it++) {
263 m->mothurOut("Your file does not include the label " + *it);
264 if (processedLabels.count(lastLabel) != 1) {
265 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
268 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
272 //run last label if you need to
273 if (needToRun == true) {
274 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
275 lookup = input->getSharedRAbundVectors(lastLabel);
277 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
279 normalize(lookup, out);
281 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
284 }else{ //relabund values
285 lookupFloat = input->getSharedRAbundFloatVectors();
286 string lastLabel = lookupFloat[0]->getLabel();
288 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
289 set<string> processedLabels;
290 set<string> userLabels = labels;
292 //set norm to smallest group number
293 if (method == "totalgroup") {
295 norm = lookupFloat[0]->getNumSeqs();
296 for (int i = 1; i < lookupFloat.size(); i++) {
297 if (lookupFloat[i]->getNumSeqs() < norm) { norm = lookupFloat[i]->getNumSeqs(); }
301 m->mothurOut("Normalizing to " + toString(norm) + "."); m->mothurOutEndLine();
304 //as long as you are not at the end of the file or done wih the lines you want
305 while((lookupFloat[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
307 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
309 if(allLines == 1 || labels.count(lookupFloat[0]->getLabel()) == 1){
311 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
312 normalize(lookupFloat, out);
314 processedLabels.insert(lookupFloat[0]->getLabel());
315 userLabels.erase(lookupFloat[0]->getLabel());
318 if ((m->anyLabelsToProcess(lookupFloat[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
319 string saveLabel = lookupFloat[0]->getLabel();
321 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; }
322 lookupFloat = input->getSharedRAbundFloatVectors(lastLabel);
323 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
325 normalize(lookupFloat, out);
327 processedLabels.insert(lookupFloat[0]->getLabel());
328 userLabels.erase(lookupFloat[0]->getLabel());
330 //restore real lastlabel to save below
331 lookupFloat[0]->setLabel(saveLabel);
334 lastLabel = lookupFloat[0]->getLabel();
335 //prevent memory leak
336 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; lookupFloat[i] = NULL; }
338 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
340 //get next line to process
341 lookupFloat = input->getSharedRAbundFloatVectors();
344 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
346 //output error messages about any remaining user labels
347 set<string>::iterator it;
348 bool needToRun = false;
349 for (it = userLabels.begin(); it != userLabels.end(); it++) {
350 m->mothurOut("Your file does not include the label " + *it);
351 if (processedLabels.count(lastLabel) != 1) {
352 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
355 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
359 //run last label if you need to
360 if (needToRun == true) {
361 for (int i = 0; i < lookupFloat.size(); i++) { if (lookupFloat[i] != NULL) { delete lookupFloat[i]; } }
362 lookupFloat = input->getSharedRAbundFloatVectors(lastLabel);
364 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
366 normalize(lookupFloat, out);
368 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; }
372 //reset groups parameter
373 globaldata->Groups.clear();
377 if (m->control_pressed) { outputTypes.clear(); remove(outputFileName.c_str()); return 0;}
379 m->mothurOutEndLine();
380 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
381 m->mothurOut(outputFileName); m->mothurOutEndLine(); outputNames.push_back(outputFileName); outputTypes["shared"].push_back(outputFileName);
382 m->mothurOutEndLine();
386 catch(exception& e) {
387 m->errorOut(e, "NormalizeSharedCommand", "execute");
391 //**********************************************************************************************************************
393 int NormalizeSharedCommand::normalize(vector<SharedRAbundVector*>& thisLookUp, ofstream& out){
395 if (pickedGroups) { eliminateZeroOTUS(thisLookUp); }
397 if (method == "totalgroup") {
399 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
401 for (int i = 0; i < thisLookUp.size(); i++) {
403 if (m->control_pressed) { return 0; }
405 int abund = thisLookUp[i]->getAbundance(j);
407 float relabund = abund / (float) thisLookUp[i]->getNumSeqs();
408 float newNorm = relabund * norm;
410 //round to nearest int
411 int finalNorm = (int) floor((newNorm + 0.5));
413 thisLookUp[i]->set(j, finalNorm, thisLookUp[i]->getGroup());
417 }else if (method == "zscore") {
419 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
421 if (m->control_pressed) { return 0; }
425 for (int i = 0; i < thisLookUp.size(); i++) { mean += thisLookUp[i]->getAbundance(j); }
426 mean /= (float) thisLookUp.size();
428 //calc standard deviation
429 float sumSquared = 0.0;
430 for (int i = 0; i < thisLookUp.size(); i++) { sumSquared += (((float)thisLookUp[i]->getAbundance(j) - mean) * ((float)thisLookUp[i]->getAbundance(j) - mean)); }
431 sumSquared /= (float) thisLookUp.size();
433 float standardDev = sqrt(sumSquared);
435 for (int i = 0; i < thisLookUp.size(); i++) {
437 if (standardDev != 0) { // stop divide by zero
438 float newNorm = ((float)thisLookUp[i]->getAbundance(j) - mean) / standardDev;
439 //round to nearest int
440 finalNorm = (int) floor((newNorm + 0.5));
443 thisLookUp[i]->set(j, finalNorm, thisLookUp[i]->getGroup());
447 }else{ m->mothurOut(method + " is not a valid scaling option."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
451 eliminateZeroOTUS(thisLookUp);
453 for (int i = 0; i < thisLookUp.size(); i++) {
454 out << thisLookUp[i]->getLabel() << '\t' << thisLookUp[i]->getGroup() << '\t';
455 thisLookUp[i]->print(out);
460 catch(exception& e) {
461 m->errorOut(e, "NormalizeSharedCommand", "normalize");
465 //**********************************************************************************************************************
467 int NormalizeSharedCommand::normalize(vector<SharedRAbundFloatVector*>& thisLookUp, ofstream& out){
469 if (pickedGroups) { eliminateZeroOTUS(thisLookUp); }
471 if (method == "totalgroup") {
473 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
475 for (int i = 0; i < thisLookUp.size(); i++) {
477 if (m->control_pressed) { return 0; }
479 float abund = thisLookUp[i]->getAbundance(j);
481 float relabund = abund / (float) thisLookUp[i]->getNumSeqs();
482 float newNorm = relabund * norm;
484 thisLookUp[i]->set(j, newNorm, thisLookUp[i]->getGroup());
488 }else if (method == "zscore") {
489 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
491 if (m->control_pressed) { return 0; }
495 for (int i = 0; i < thisLookUp.size(); i++) { mean += thisLookUp[i]->getAbundance(j); }
496 mean /= (float) thisLookUp.size();
498 //calc standard deviation
499 float sumSquared = 0.0;
500 for (int i = 0; i < thisLookUp.size(); i++) { sumSquared += ((thisLookUp[i]->getAbundance(j) - mean) * (thisLookUp[i]->getAbundance(j) - mean)); }
501 sumSquared /= (float) thisLookUp.size();
503 float standardDev = sqrt(sumSquared);
505 for (int i = 0; i < thisLookUp.size(); i++) {
507 if (standardDev != 0) { // stop divide by zero
508 newNorm = (thisLookUp[i]->getAbundance(j) - mean) / standardDev;
510 thisLookUp[i]->set(j, newNorm, thisLookUp[i]->getGroup());
514 }else{ m->mothurOut(method + " is not a valid scaling option."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
517 eliminateZeroOTUS(thisLookUp);
519 for (int i = 0; i < thisLookUp.size(); i++) {
520 out << thisLookUp[i]->getLabel() << '\t' << thisLookUp[i]->getGroup() << '\t';
521 thisLookUp[i]->print(out);
526 catch(exception& e) {
527 m->errorOut(e, "NormalizeSharedCommand", "normalize");
531 //**********************************************************************************************************************
532 int NormalizeSharedCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
535 vector<SharedRAbundVector*> newLookup;
536 for (int i = 0; i < thislookup.size(); i++) {
537 SharedRAbundVector* temp = new SharedRAbundVector();
538 temp->setLabel(thislookup[i]->getLabel());
539 temp->setGroup(thislookup[i]->getGroup());
540 newLookup.push_back(temp);
544 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
545 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
547 //look at each sharedRabund and make sure they are not all zero
549 for (int j = 0; j < thislookup.size(); j++) {
550 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
553 //if they are not all zero add this bin
555 for (int j = 0; j < thislookup.size(); j++) {
556 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
561 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
563 thislookup = newLookup;
568 catch(exception& e) {
569 m->errorOut(e, "NormalizeSharedCommand", "eliminateZeroOTUS");
573 //**********************************************************************************************************************
574 int NormalizeSharedCommand::eliminateZeroOTUS(vector<SharedRAbundFloatVector*>& thislookup) {
577 vector<SharedRAbundFloatVector*> newLookup;
578 for (int i = 0; i < thislookup.size(); i++) {
579 SharedRAbundFloatVector* temp = new SharedRAbundFloatVector();
580 temp->setLabel(thislookup[i]->getLabel());
581 temp->setGroup(thislookup[i]->getGroup());
582 newLookup.push_back(temp);
586 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
587 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
589 //look at each sharedRabund and make sure they are not all zero
591 for (int j = 0; j < thislookup.size(); j++) {
592 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
595 //if they are not all zero add this bin
597 for (int j = 0; j < thislookup.size(); j++) {
598 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
603 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
605 thislookup = newLookup;
610 catch(exception& e) {
611 m->errorOut(e, "NormalizeSharedCommand", "eliminateZeroOTUS");
616 //**********************************************************************************************************************