2 * normalizesharedcommand.cpp
5 * Created by westcott on 9/15/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "normalizesharedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> NormalizeSharedCommand::getValidParameters(){
15 string Array[] = {"groups","label","method","makerelabund","outputdir","inputdir","norm"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "NormalizeSharedCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 NormalizeSharedCommand::NormalizeSharedCommand(){
27 abort = true; calledHelp = true;
28 vector<string> tempOutNames;
29 outputTypes["shared"] = tempOutNames;
32 m->errorOut(e, "NormalizeSharedCommand", "NormalizeSharedCommand");
36 //**********************************************************************************************************************
37 vector<string> NormalizeSharedCommand::getRequiredParameters(){
39 vector<string> myArray;
43 m->errorOut(e, "NormalizeSharedCommand", "getRequiredParameters");
47 //**********************************************************************************************************************
48 vector<string> NormalizeSharedCommand::getRequiredFiles(){
50 string Array[] = {"shared"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "NormalizeSharedCommand", "getRequiredFiles");
59 //**********************************************************************************************************************
61 NormalizeSharedCommand::NormalizeSharedCommand(string option) {
63 globaldata = GlobalData::getInstance();
64 abort = false; calledHelp = false;
68 //allow user to run help
69 if(option == "help") { help(); abort = true; calledHelp = true; }
72 //valid paramters for this command
73 string AlignArray[] = {"groups","label","method","makerelabund","outputdir","inputdir","norm"};
74 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
76 OptionParser parser(option);
77 map<string,string> parameters = parser.getParameters();
79 ValidParameters validParameter;
81 //check to make sure all parameters are valid for command
82 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
83 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
86 //initialize outputTypes
87 vector<string> tempOutNames;
88 outputTypes["shared"] = tempOutNames;
90 //if the user changes the output directory command factory will send this info to us in the output parameter
91 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
93 outputDir += m->hasPath(globaldata->inputFileName); //if user entered a file with a path then preserve it
96 //make sure the user has already run the read.otu command
97 if ((globaldata->getSharedFile() == "") && (globaldata->getRelAbundFile() == "")) {
98 m->mothurOut("You must read a list and a group, shared or relabund file before you can use the normalize.shared command."); m->mothurOutEndLine(); abort = true;
101 if ((globaldata->getSharedFile() != "") && (globaldata->getRelAbundFile() != "")) {
102 m->mothurOut("You may not use both a shared and relabund file as input for normalize.shared command."); m->mothurOutEndLine(); abort = true;
106 //check for optional parameter and set defaults
107 // ...at some point should added some additional type checking...
108 label = validParameter.validFile(parameters, "label", false);
109 if (label == "not found") { label = ""; }
111 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
112 else { allLines = 1; }
115 //if the user has not specified any labels use the ones from read.otu
117 allLines = globaldata->allLines;
118 labels = globaldata->labels;
121 groups = validParameter.validFile(parameters, "groups", false);
122 if (groups == "not found") { groups = ""; pickedGroups = false; }
125 m->splitAtDash(groups, Groups);
126 globaldata->Groups = Groups;
129 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "totalgroup"; }
130 if ((method != "totalgroup") && (method != "zscore")) { m->mothurOut(method + " is not a valid scaling option for the normalize.shared command. The options are totalgroup and zscore. We hope to add more ways to normalize in the future, suggestions are welcome!"); m->mothurOutEndLine(); abort = true; }
132 string temp = validParameter.validFile(parameters, "norm", false);
133 if (temp == "not found") {
134 norm = 0; //once you have read, set norm to smallest group number
137 if (norm < 0) { m->mothurOut("norm must be positive."); m->mothurOutEndLine(); abort=true; }
140 temp = validParameter.validFile(parameters, "makerelabund", false); if (temp == "") { temp = "f"; }
141 makeRelabund = m->isTrue(temp);
143 if ((globaldata->getFormat() != "sharedfile") && makeRelabund) { m->mothurOut("makerelabund can only be used with a shared file."); m->mothurOutEndLine(); }
148 catch(exception& e) {
149 m->errorOut(e, "NormalizeSharedCommand", "NormalizeSharedCommand");
154 //**********************************************************************************************************************
156 void NormalizeSharedCommand::help(){
158 m->mothurOut("The normalize.shared command can only be executed after a successful read.otu command of a list and group, shared or relabund file.\n");
159 m->mothurOut("The normalize.shared command parameters are groups, method, norm, makerelabund and label. No parameters are required.\n");
160 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n");
161 m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n");
162 m->mothurOut("The method parameter allows you to select what method you would like to use to normalize. The options are totalgroup and zscore. We hope to add more ways to normalize in the future, suggestions are welcome!\n");
163 m->mothurOut("The makerelabund parameter allows you to convert a shared file to a relabund file before you normalize. default=f.\n");
164 m->mothurOut("The norm parameter allows you to number you would like to normalize to. By default this is set to the number of sequences in your smallest group.\n");
165 m->mothurOut("The normalize.shared command should be in the following format: normalize.shared(groups=yourGroups, label=yourLabels).\n");
166 m->mothurOut("Example normalize.shared(groups=A-B-C, scale=totalgroup).\n");
167 m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n");
168 m->mothurOut("The normalize.shared command outputs a .norm.shared file.\n");
169 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
172 catch(exception& e) {
173 m->errorOut(e, "NormalizeSharedCommand", "help");
178 //**********************************************************************************************************************
180 NormalizeSharedCommand::~NormalizeSharedCommand(){}
182 //**********************************************************************************************************************
184 int NormalizeSharedCommand::execute(){
187 if (abort == true) { if (calledHelp) { return 0; } return 2; }
189 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "norm.shared";
191 m->openOutputFile(outputFileName, out);
193 if (globaldata->getFormat() == "sharedfile") { input = new InputData(globaldata->inputFileName, "sharedfile"); }
194 else { input = new InputData(globaldata->inputFileName, "relabund"); }
196 //you are reading a sharedfile and you do not want to make relabund
197 if ((globaldata->getFormat() == "sharedfile") && (!makeRelabund)) {
198 lookup = input->getSharedRAbundVectors();
199 string lastLabel = lookup[0]->getLabel();
201 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
202 set<string> processedLabels;
203 set<string> userLabels = labels;
205 if (method == "totalgroup") {
206 //set norm to smallest group number
208 norm = lookup[0]->getNumSeqs();
209 for (int i = 1; i < lookup.size(); i++) {
210 if (lookup[i]->getNumSeqs() < norm) { norm = lookup[i]->getNumSeqs(); }
214 m->mothurOut("Normalizing to " + toString(norm) + "."); m->mothurOutEndLine();
217 //as long as you are not at the end of the file or done wih the lines you want
218 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
220 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
222 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
224 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
225 normalize(lookup, out);
227 processedLabels.insert(lookup[0]->getLabel());
228 userLabels.erase(lookup[0]->getLabel());
231 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
232 string saveLabel = lookup[0]->getLabel();
234 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
235 lookup = input->getSharedRAbundVectors(lastLabel);
236 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
238 normalize(lookup, out);
240 processedLabels.insert(lookup[0]->getLabel());
241 userLabels.erase(lookup[0]->getLabel());
243 //restore real lastlabel to save below
244 lookup[0]->setLabel(saveLabel);
247 lastLabel = lookup[0]->getLabel();
248 //prevent memory leak
249 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
251 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
253 //get next line to process
254 lookup = input->getSharedRAbundVectors();
257 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
259 //output error messages about any remaining user labels
260 set<string>::iterator it;
261 bool needToRun = false;
262 for (it = userLabels.begin(); it != userLabels.end(); it++) {
263 m->mothurOut("Your file does not include the label " + *it);
264 if (processedLabels.count(lastLabel) != 1) {
265 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
268 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
272 //run last label if you need to
273 if (needToRun == true) {
274 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
275 lookup = input->getSharedRAbundVectors(lastLabel);
277 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
279 normalize(lookup, out);
281 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
284 }else{ //relabund values
285 lookupFloat = input->getSharedRAbundFloatVectors();
286 string lastLabel = lookupFloat[0]->getLabel();
288 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
289 set<string> processedLabels;
290 set<string> userLabels = labels;
292 //set norm to smallest group number
293 if (method == "totalgroup") {
295 norm = lookupFloat[0]->getNumSeqs();
296 for (int i = 1; i < lookupFloat.size(); i++) {
297 if (lookupFloat[i]->getNumSeqs() < norm) { norm = lookupFloat[i]->getNumSeqs(); }
301 m->mothurOut("Normalizing to " + toString(norm) + "."); m->mothurOutEndLine();
304 //as long as you are not at the end of the file or done wih the lines you want
305 while((lookupFloat[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
307 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
309 if(allLines == 1 || labels.count(lookupFloat[0]->getLabel()) == 1){
311 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
312 normalize(lookupFloat, out);
314 processedLabels.insert(lookupFloat[0]->getLabel());
315 userLabels.erase(lookupFloat[0]->getLabel());
318 if ((m->anyLabelsToProcess(lookupFloat[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
319 string saveLabel = lookupFloat[0]->getLabel();
321 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; }
322 lookupFloat = input->getSharedRAbundFloatVectors(lastLabel);
323 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
325 normalize(lookupFloat, out);
327 processedLabels.insert(lookupFloat[0]->getLabel());
328 userLabels.erase(lookupFloat[0]->getLabel());
330 //restore real lastlabel to save below
331 lookupFloat[0]->setLabel(saveLabel);
334 lastLabel = lookupFloat[0]->getLabel();
335 //prevent memory leak
336 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; lookupFloat[i] = NULL; }
338 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
340 //get next line to process
341 lookupFloat = input->getSharedRAbundFloatVectors();
344 if (m->control_pressed) { outputTypes.clear(); globaldata->Groups.clear(); out.close(); remove(outputFileName.c_str()); return 0; }
346 //output error messages about any remaining user labels
347 set<string>::iterator it;
348 bool needToRun = false;
349 for (it = userLabels.begin(); it != userLabels.end(); it++) {
350 m->mothurOut("Your file does not include the label " + *it);
351 if (processedLabels.count(lastLabel) != 1) {
352 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
355 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
359 //run last label if you need to
360 if (needToRun == true) {
361 for (int i = 0; i < lookupFloat.size(); i++) { if (lookupFloat[i] != NULL) { delete lookupFloat[i]; } }
362 lookupFloat = input->getSharedRAbundFloatVectors(lastLabel);
364 m->mothurOut(lookupFloat[0]->getLabel()); m->mothurOutEndLine();
366 normalize(lookupFloat, out);
368 for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; }
372 //reset groups parameter
373 globaldata->Groups.clear();
377 if (m->control_pressed) { outputTypes.clear(); remove(outputFileName.c_str()); return 0;}
379 m->mothurOutEndLine();
380 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
381 m->mothurOut(outputFileName); m->mothurOutEndLine(); outputNames.push_back(outputFileName); outputTypes["shared"].push_back(outputFileName);
382 m->mothurOutEndLine();
384 //set shared file as new current sharedfile
386 itTypes = outputTypes.find("shared");
387 if (itTypes != outputTypes.end()) {
388 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); }
393 catch(exception& e) {
394 m->errorOut(e, "NormalizeSharedCommand", "execute");
398 //**********************************************************************************************************************
400 int NormalizeSharedCommand::normalize(vector<SharedRAbundVector*>& thisLookUp, ofstream& out){
402 if (pickedGroups) { eliminateZeroOTUS(thisLookUp); }
404 if (method == "totalgroup") {
406 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
408 for (int i = 0; i < thisLookUp.size(); i++) {
410 if (m->control_pressed) { return 0; }
412 int abund = thisLookUp[i]->getAbundance(j);
414 float relabund = abund / (float) thisLookUp[i]->getNumSeqs();
415 float newNorm = relabund * norm;
417 //round to nearest int
418 int finalNorm = (int) floor((newNorm + 0.5));
420 thisLookUp[i]->set(j, finalNorm, thisLookUp[i]->getGroup());
424 }else if (method == "zscore") {
426 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
428 if (m->control_pressed) { return 0; }
432 for (int i = 0; i < thisLookUp.size(); i++) { mean += thisLookUp[i]->getAbundance(j); }
433 mean /= (float) thisLookUp.size();
435 //calc standard deviation
436 float sumSquared = 0.0;
437 for (int i = 0; i < thisLookUp.size(); i++) { sumSquared += (((float)thisLookUp[i]->getAbundance(j) - mean) * ((float)thisLookUp[i]->getAbundance(j) - mean)); }
438 sumSquared /= (float) thisLookUp.size();
440 float standardDev = sqrt(sumSquared);
442 for (int i = 0; i < thisLookUp.size(); i++) {
444 if (standardDev != 0) { // stop divide by zero
445 float newNorm = ((float)thisLookUp[i]->getAbundance(j) - mean) / standardDev;
446 //round to nearest int
447 finalNorm = (int) floor((newNorm + 0.5));
450 thisLookUp[i]->set(j, finalNorm, thisLookUp[i]->getGroup());
454 }else{ m->mothurOut(method + " is not a valid scaling option."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
458 eliminateZeroOTUS(thisLookUp);
460 for (int i = 0; i < thisLookUp.size(); i++) {
461 out << thisLookUp[i]->getLabel() << '\t' << thisLookUp[i]->getGroup() << '\t';
462 thisLookUp[i]->print(out);
467 catch(exception& e) {
468 m->errorOut(e, "NormalizeSharedCommand", "normalize");
472 //**********************************************************************************************************************
474 int NormalizeSharedCommand::normalize(vector<SharedRAbundFloatVector*>& thisLookUp, ofstream& out){
476 if (pickedGroups) { eliminateZeroOTUS(thisLookUp); }
478 if (method == "totalgroup") {
480 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
482 for (int i = 0; i < thisLookUp.size(); i++) {
484 if (m->control_pressed) { return 0; }
486 float abund = thisLookUp[i]->getAbundance(j);
488 float relabund = abund / (float) thisLookUp[i]->getNumSeqs();
489 float newNorm = relabund * norm;
491 thisLookUp[i]->set(j, newNorm, thisLookUp[i]->getGroup());
495 }else if (method == "zscore") {
496 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
498 if (m->control_pressed) { return 0; }
502 for (int i = 0; i < thisLookUp.size(); i++) { mean += thisLookUp[i]->getAbundance(j); }
503 mean /= (float) thisLookUp.size();
505 //calc standard deviation
506 float sumSquared = 0.0;
507 for (int i = 0; i < thisLookUp.size(); i++) { sumSquared += ((thisLookUp[i]->getAbundance(j) - mean) * (thisLookUp[i]->getAbundance(j) - mean)); }
508 sumSquared /= (float) thisLookUp.size();
510 float standardDev = sqrt(sumSquared);
512 for (int i = 0; i < thisLookUp.size(); i++) {
514 if (standardDev != 0) { // stop divide by zero
515 newNorm = (thisLookUp[i]->getAbundance(j) - mean) / standardDev;
517 thisLookUp[i]->set(j, newNorm, thisLookUp[i]->getGroup());
521 }else{ m->mothurOut(method + " is not a valid scaling option."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
524 eliminateZeroOTUS(thisLookUp);
526 for (int i = 0; i < thisLookUp.size(); i++) {
527 out << thisLookUp[i]->getLabel() << '\t' << thisLookUp[i]->getGroup() << '\t';
528 thisLookUp[i]->print(out);
533 catch(exception& e) {
534 m->errorOut(e, "NormalizeSharedCommand", "normalize");
538 //**********************************************************************************************************************
539 int NormalizeSharedCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
542 vector<SharedRAbundVector*> newLookup;
543 for (int i = 0; i < thislookup.size(); i++) {
544 SharedRAbundVector* temp = new SharedRAbundVector();
545 temp->setLabel(thislookup[i]->getLabel());
546 temp->setGroup(thislookup[i]->getGroup());
547 newLookup.push_back(temp);
551 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
552 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
554 //look at each sharedRabund and make sure they are not all zero
556 for (int j = 0; j < thislookup.size(); j++) {
557 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
560 //if they are not all zero add this bin
562 for (int j = 0; j < thislookup.size(); j++) {
563 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
568 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
570 thislookup = newLookup;
575 catch(exception& e) {
576 m->errorOut(e, "NormalizeSharedCommand", "eliminateZeroOTUS");
580 //**********************************************************************************************************************
581 int NormalizeSharedCommand::eliminateZeroOTUS(vector<SharedRAbundFloatVector*>& thislookup) {
584 vector<SharedRAbundFloatVector*> newLookup;
585 for (int i = 0; i < thislookup.size(); i++) {
586 SharedRAbundFloatVector* temp = new SharedRAbundFloatVector();
587 temp->setLabel(thislookup[i]->getLabel());
588 temp->setGroup(thislookup[i]->getGroup());
589 newLookup.push_back(temp);
593 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
594 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
596 //look at each sharedRabund and make sure they are not all zero
598 for (int j = 0; j < thislookup.size(); j++) {
599 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
602 //if they are not all zero add this bin
604 for (int j = 0; j < thislookup.size(); j++) {
605 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
610 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
612 thislookup = newLookup;
617 catch(exception& e) {
618 m->errorOut(e, "NormalizeSharedCommand", "eliminateZeroOTUS");
623 //**********************************************************************************************************************