2 * normalizesharedcommand.cpp
5 * Created by westcott on 9/15/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "normalizesharedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> NormalizeSharedCommand::getValidParameters(){
15 string Array[] = {"groups","label","method","makerelabund","outputdir","inputdir","norm"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "NormalizeSharedCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 NormalizeSharedCommand::NormalizeSharedCommand(){
28 //initialize outputTypes
29 vector<string> tempOutNames;
30 outputTypes["shared"] = tempOutNames;
33 m->errorOut(e, "NormalizeSharedCommand", "NormalizeSharedCommand");
37 //**********************************************************************************************************************
38 vector<string> NormalizeSharedCommand::getRequiredParameters(){
40 vector<string> myArray;
44 m->errorOut(e, "NormalizeSharedCommand", "getRequiredParameters");
48 //**********************************************************************************************************************
49 vector<string> NormalizeSharedCommand::getRequiredFiles(){
51 string Array[] = {"shared"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
56 m->errorOut(e, "NormalizeSharedCommand", "getRequiredFiles");
60 //**********************************************************************************************************************
62 NormalizeSharedCommand::NormalizeSharedCommand(string option) {
64 globaldata = GlobalData::getInstance();
69 //allow user to run help
70 if(option == "help") { help(); abort = true; }
73 //valid paramters for this command
74 string AlignArray[] = {"groups","label","method","makerelabund","outputdir","inputdir","norm"};
75 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
77 OptionParser parser(option);
78 map<string,string> parameters = parser.getParameters();
80 ValidParameters validParameter;
82 //check to make sure all parameters are valid for command
83 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
84 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
87 //initialize outputTypes
88 vector<string> tempOutNames;
89 outputTypes["shared"] = tempOutNames;
91 //if the user changes the output directory command factory will send this info to us in the output parameter
92 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
94 outputDir += m->hasPath(globaldata->inputFileName); //if user entered a file with a path then preserve it
97 //make sure the user has already run the read.otu command
98 if ((globaldata->getSharedFile() == "") && (globaldata->getRelAbundFile() == "")) {
99 m->mothurOut("You must read a list and a group, shared or relabund file before you can use the normalize.shared command."); m->mothurOutEndLine(); abort = true;
102 if ((globaldata->getSharedFile() != "") && (globaldata->getRelAbundFile() != "")) {
103 m->mothurOut("You may not use both a shared and relabund file as input for normalize.shared command."); m->mothurOutEndLine(); abort = true;
107 //check for optional parameter and set defaults
108 // ...at some point should added some additional type checking...
109 label = validParameter.validFile(parameters, "label", false);
110 if (label == "not found") { label = ""; }
112 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
113 else { allLines = 1; }
116 //if the user has not specified any labels use the ones from read.otu
118 allLines = globaldata->allLines;
119 labels = globaldata->labels;
122 groups = validParameter.validFile(parameters, "groups", false);
123 if (groups == "not found") { groups = ""; pickedGroups = false; }
126 m->splitAtDash(groups, Groups);
127 globaldata->Groups = Groups;
130 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "totalgroup"; }
131 if ((method != "totalgroup") && (method != "zscore")) { m->mothurOut(method + " is not a valid scaling option for the normalize.shared command. The options are totalgroup and zscore. We hope to add more ways to normalize in the future, suggestions are welcome!"); m->mothurOutEndLine(); abort = true; }
133 string temp = validParameter.validFile(parameters, "norm", false);
134 if (temp == "not found") {
135 norm = 0; //once you have read, set norm to smallest group number
138 if (norm < 0) { m->mothurOut("norm must be positive."); m->mothurOutEndLine(); abort=true; }
141 temp = validParameter.validFile(parameters, "makerelabund", false); if (temp == "") { temp = "f"; }
142 makeRelabund = m->isTrue(temp);
144 if ((globaldata->getFormat() != "sharedfile") && makeRelabund) { m->mothurOut("makerelabund can only be used with a shared file."); m->mothurOutEndLine(); }
149 catch(exception& e) {
150 m->errorOut(e, "NormalizeSharedCommand", "NormalizeSharedCommand");
155 //**********************************************************************************************************************
157 void NormalizeSharedCommand::help(){
159 m->mothurOut("The normalize.shared command can only be executed after a successful read.otu command of a list and group, shared or relabund file.\n");
160 m->mothurOut("The normalize.shared command parameters are groups, method, norm, makerelabund and label. No parameters are required.\n");
161 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n");
162 m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n");
163 m->mothurOut("The method parameter allows you to select what method you would like to use to normalize. The options are totalgroup and zscore. We hope to add more ways to normalize in the future, suggestions are welcome!\n");
164 m->mothurOut("The makerelabund parameter allows you to convert a shared file to a relabund file before you normalize. default=f.\n");
165 m->mothurOut("The norm parameter allows you to number you would like to normalize to. By default this is set to the number of sequences in your smallest group.\n");
166 m->mothurOut("The normalize.shared command should be in the following format: normalize.shared(groups=yourGroups, label=yourLabels).\n");
167 m->mothurOut("Example normalize.shared(groups=A-B-C, scale=totalgroup).\n");
168 m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n");
169 m->mothurOut("The normalize.shared command outputs a .norm.shared file.\n");
170 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
173 catch(exception& e) {
174 m->errorOut(e, "NormalizeSharedCommand", "help");
179 //**********************************************************************************************************************
181 NormalizeSharedCommand::~NormalizeSharedCommand(){}
183 //**********************************************************************************************************************
185 int NormalizeSharedCommand::execute(){
188 if (abort == true) { return 0; }
190 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "norm.shared";
192 m->openOutputFile(outputFileName, out);
194 if (globaldata->getFormat() == "sharedfile") { input = new InputData(globaldata->inputFileName, "sharedfile"); }
195 else { input = new InputData(globaldata->inputFileName, "relabund"); }
197 //you are reading a sharedfile and you do not want to make relabund
198 if ((globaldata->getFormat() == "sharedfile") && (!makeRelabund)) {
199 lookup = input->getSharedRAbundVectors();
200 string lastLabel = lookup[0]->getLabel();
202 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
203 set<string> processedLabels;
204 set<string> userLabels = labels;
206 if (method == "totalgroup") {
207 //set norm to smallest group number
209 norm = lookup[0]->getNumSeqs();
210 for (int i = 1; i < lookup.size(); i++) {
211 if (lookup[i]->getNumSeqs() < norm) { norm = lookup[i]->getNumSeqs(); }
215 m->mothurOut("Normalizing to " + toString(norm) + "."); m->mothurOutEndLine();
218 //as long as you are not at the end of the file or done wih the lines you want
219 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
221 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
223 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
225 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
226 normalize(lookup, out);
228 processedLabels.insert(lookup[0]->getLabel());
229 userLabels.erase(lookup[0]->getLabel());
232 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
233 string saveLabel = lookup[0]->getLabel();
235 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
236 lookup = input->getSharedRAbundVectors(lastLabel);
237 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
239 normalize(lookup, out);
241 processedLabels.insert(lookup[0]->getLabel());
242 userLabels.erase(lookup[0]->getLabel());
244 //restore real lastlabel to save below
245 lookup[0]->setLabel(saveLabel);
248 lastLabel = lookup[0]->getLabel();
249 //prevent memory leak
250 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
252 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
254 //get next line to process
255 lookup = input->getSharedRAbundVectors();
258 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
260 //output error messages about any remaining user labels
261 set<string>::iterator it;
262 bool needToRun = false;
263 for (it = userLabels.begin(); it != userLabels.end(); it++) {
264 m->mothurOut("Your file does not include the label " + *it);
265 if (processedLabels.count(lastLabel) != 1) {
266 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
269 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
273 //run last label if you need to
274 if (needToRun == true) {
275 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
276 lookup = input->getSharedRAbundVectors(lastLabel);
278 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
280 normalize(lookup, out);
282 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
285 }else{ //relabund values
286 lookupFloat = input->getSharedRAbundFloatVectors();
287 string lastLabel = lookupFloat[0]->getLabel();
289 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
290 set<string> processedLabels;
291 set<string> userLabels = labels;
293 //set norm to smallest group number
294 if (method == "totalgroup") {
296 norm = lookupFloat[0]->getNumSeqs();
297 for (int i = 1; i < lookupFloat.size(); i++) {
298 if (lookupFloat[i]->getNumSeqs() < norm) { norm = lookupFloat[i]->getNumSeqs(); }
302 m->mothurOut("Normalizing to " + toString(norm) + "."); m->mothurOutEndLine();
305 //as long as you are not at the end of the file or done wih the lines you want
306 while((lookupFloat[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
308 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
310 if(allLines == 1 || labels.count(lookupFloat[0]->getLabel()) == 1){
312 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
313 normalize(lookupFloat, out);
315 processedLabels.insert(lookupFloat[0]->getLabel());
316 userLabels.erase(lookupFloat[0]->getLabel());
319 if ((m->anyLabelsToProcess(lookupFloat[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
320 string saveLabel = lookupFloat[0]->getLabel();
322 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; }
323 lookupFloat = input->getSharedRAbundFloatVectors(lastLabel);
324 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
326 normalize(lookupFloat, out);
328 processedLabels.insert(lookupFloat[0]->getLabel());
329 userLabels.erase(lookupFloat[0]->getLabel());
331 //restore real lastlabel to save below
332 lookupFloat[0]->setLabel(saveLabel);
335 lastLabel = lookupFloat[0]->getLabel();
336 //prevent memory leak
337 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; lookupFloat[i] = NULL; }
339 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
341 //get next line to process
342 lookupFloat = input->getSharedRAbundFloatVectors();
345 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
347 //output error messages about any remaining user labels
348 set<string>::iterator it;
349 bool needToRun = false;
350 for (it = userLabels.begin(); it != userLabels.end(); it++) {
351 m->mothurOut("Your file does not include the label " + *it);
352 if (processedLabels.count(lastLabel) != 1) {
353 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
356 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
360 //run last label if you need to
361 if (needToRun == true) {
362 for (int i = 0; i < lookupFloat.size(); i++) { if (lookupFloat[i] != NULL) { delete lookupFloat[i]; } }
363 lookupFloat = input->getSharedRAbundFloatVectors(lastLabel);
365 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
367 normalize(lookupFloat, out);
369 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; }
373 //reset groups parameter
374 globaldata->Groups.clear();
378 if (m->control_pressed) { outputTypes.clear(); remove(outputFileName.c_str()); return 0;}
380 m->mothurOutEndLine();
381 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
382 m->mothurOut(outputFileName); m->mothurOutEndLine(); outputNames.push_back(outputFileName); outputTypes["shared"].push_back(outputFileName);
383 m->mothurOutEndLine();
387 catch(exception& e) {
388 m->errorOut(e, "NormalizeSharedCommand", "execute");
392 //**********************************************************************************************************************
394 int NormalizeSharedCommand::normalize(vector<SharedRAbundVector*>& thisLookUp, ofstream& out){
396 if (pickedGroups) { eliminateZeroOTUS(thisLookUp); }
398 if (method == "totalgroup") {
400 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
402 for (int i = 0; i < thisLookUp.size(); i++) {
404 if (m->control_pressed) { return 0; }
406 int abund = thisLookUp[i]->getAbundance(j);
408 float relabund = relabund = abund / (float) thisLookUp[i]->getNumSeqs();
409 float newNorm = relabund * norm;
411 //round to nearest int
412 int finalNorm = (int) floor((newNorm + 0.5));
414 thisLookUp[i]->set(j, finalNorm, thisLookUp[i]->getGroup());
418 }else if (method == "zscore") {
420 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
422 if (m->control_pressed) { return 0; }
426 for (int i = 0; i < thisLookUp.size(); i++) { mean += thisLookUp[i]->getAbundance(j); }
427 mean /= (float) thisLookUp.size();
429 //calc standard deviation
430 float sumSquared = 0.0;
431 for (int i = 0; i < thisLookUp.size(); i++) { sumSquared += (((float)thisLookUp[i]->getAbundance(j) - mean) * ((float)thisLookUp[i]->getAbundance(j) - mean)); }
432 sumSquared /= (float) thisLookUp.size();
434 float standardDev = sqrt(sumSquared);
436 for (int i = 0; i < thisLookUp.size(); i++) {
438 if (standardDev != 0) { // stop divide by zero
439 float newNorm = ((float)thisLookUp[i]->getAbundance(j) - mean) / standardDev;
440 //round to nearest int
441 finalNorm = (int) floor((newNorm + 0.5));
444 thisLookUp[i]->set(j, finalNorm, thisLookUp[i]->getGroup());
448 }else{ m->mothurOut(method + " is not a valid scaling option."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
452 eliminateZeroOTUS(thisLookUp);
454 for (int i = 0; i < thisLookUp.size(); i++) {
455 out << thisLookUp[i]->getLabel() << '\t' << thisLookUp[i]->getGroup() << '\t';
456 thisLookUp[i]->print(out);
461 catch(exception& e) {
462 m->errorOut(e, "NormalizeSharedCommand", "normalize");
466 //**********************************************************************************************************************
468 int NormalizeSharedCommand::normalize(vector<SharedRAbundFloatVector*>& thisLookUp, ofstream& out){
470 if (pickedGroups) { eliminateZeroOTUS(thisLookUp); }
472 if (method == "totalgroup") {
474 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
476 for (int i = 0; i < thisLookUp.size(); i++) {
478 if (m->control_pressed) { return 0; }
480 float abund = thisLookUp[i]->getAbundance(j);
482 float relabund = relabund = abund / (float) thisLookUp[i]->getNumSeqs();
483 float newNorm = relabund * norm;
485 thisLookUp[i]->set(j, newNorm, thisLookUp[i]->getGroup());
489 }else if (method == "zscore") {
490 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
492 if (m->control_pressed) { return 0; }
496 for (int i = 0; i < thisLookUp.size(); i++) { mean += thisLookUp[i]->getAbundance(j); }
497 mean /= (float) thisLookUp.size();
499 //calc standard deviation
500 float sumSquared = 0.0;
501 for (int i = 0; i < thisLookUp.size(); i++) { sumSquared += ((thisLookUp[i]->getAbundance(j) - mean) * (thisLookUp[i]->getAbundance(j) - mean)); }
502 sumSquared /= (float) thisLookUp.size();
504 float standardDev = sqrt(sumSquared);
506 for (int i = 0; i < thisLookUp.size(); i++) {
508 if (standardDev != 0) { // stop divide by zero
509 newNorm = (thisLookUp[i]->getAbundance(j) - mean) / standardDev;
511 thisLookUp[i]->set(j, newNorm, thisLookUp[i]->getGroup());
515 }else{ m->mothurOut(method + " is not a valid scaling option."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
518 eliminateZeroOTUS(thisLookUp);
520 for (int i = 0; i < thisLookUp.size(); i++) {
521 out << thisLookUp[i]->getLabel() << '\t' << thisLookUp[i]->getGroup() << '\t';
522 thisLookUp[i]->print(out);
527 catch(exception& e) {
528 m->errorOut(e, "NormalizeSharedCommand", "normalize");
532 //**********************************************************************************************************************
533 int NormalizeSharedCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
536 vector<SharedRAbundVector*> newLookup;
537 for (int i = 0; i < thislookup.size(); i++) {
538 SharedRAbundVector* temp = new SharedRAbundVector();
539 temp->setLabel(thislookup[i]->getLabel());
540 temp->setGroup(thislookup[i]->getGroup());
541 newLookup.push_back(temp);
545 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
546 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
548 //look at each sharedRabund and make sure they are not all zero
550 for (int j = 0; j < thislookup.size(); j++) {
551 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
554 //if they are not all zero add this bin
556 for (int j = 0; j < thislookup.size(); j++) {
557 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
562 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
564 thislookup = newLookup;
569 catch(exception& e) {
570 m->errorOut(e, "NormalizeSharedCommand", "eliminateZeroOTUS");
574 //**********************************************************************************************************************
575 int NormalizeSharedCommand::eliminateZeroOTUS(vector<SharedRAbundFloatVector*>& thislookup) {
578 vector<SharedRAbundFloatVector*> newLookup;
579 for (int i = 0; i < thislookup.size(); i++) {
580 SharedRAbundFloatVector* temp = new SharedRAbundFloatVector();
581 temp->setLabel(thislookup[i]->getLabel());
582 temp->setGroup(thislookup[i]->getGroup());
583 newLookup.push_back(temp);
587 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
588 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
590 //look at each sharedRabund and make sure they are not all zero
592 for (int j = 0; j < thislookup.size(); j++) {
593 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
596 //if they are not all zero add this bin
598 for (int j = 0; j < thislookup.size(); j++) {
599 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
604 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
606 thislookup = newLookup;
611 catch(exception& e) {
612 m->errorOut(e, "NormalizeSharedCommand", "eliminateZeroOTUS");
617 //**********************************************************************************************************************