X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=subsamplecommand.cpp;h=66ea7e91df86eb1bc5b2b55c23b08dc852fa135a;hb=e150b0b0664caec517485ee6d69dcdade6dcae77;hp=223c9f5edf68c6eaa23f668e48ac0917854f6067;hpb=e88ba6b7a994a8502030d38cc5cc542994694d4d;p=mothur.git diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 223c9f5..66ea7e9 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -11,14 +11,53 @@ #include "sharedutilities.h" //********************************************************************************************************************** -vector SubSampleCommand::getValidParameters(){ - try { - string Array[] = {"fasta", "group", "list","shared","rabund","persample", "name","sabund","size","groups","label","outputdir","inputdir"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); +vector SubSampleCommand::setParameters(){ + try { + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter plist("list", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(plist); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pshared); + CommandParameter prabund("rabund", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(prabund); + CommandParameter psabund("sabund", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(psabund); + CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); + CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); + CommandParameter psize("size", "Number", "", "0", "", "", "",false,false); parameters.push_back(psize); + CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ppersample); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; } catch(exception& e) { - m->errorOut(e, "SubSampleCommand", "getValidParameters"); + m->errorOut(e, "SubSampleCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string SubSampleCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n"; + helpString += "The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size, persample and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"; + helpString += "The namefile is only used with the fasta file, not with the listfile, because the list file should contain all sequences.\n"; + helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n"; + helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"; + helpString += "The size parameter allows you indicate the size of your subsample.\n"; + helpString += "The persample parameter allows you indicate you want to select subsample of the same size from each of your groups, default=false. It is only used with the list and fasta files if a groupfile is given.\n"; + helpString += "persample=false will select a random set of sequences of the size you select, but the number of seqs from each group may differ.\n"; + helpString += "The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files if a groupfile is given and persample=true, then size=number of seqs in smallest sample, otherwise size=10% of number of seqs.\n"; + helpString += "The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n"; + helpString += "Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n"; + helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n"; + helpString += "The sub.sample command outputs a .subsample file.\n"; + helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getHelpString"); exit(1); } } @@ -26,6 +65,7 @@ vector SubSampleCommand::getValidParameters(){ SubSampleCommand::SubSampleCommand(){ try { abort = true; calledHelp = true; + setParameters(); vector tempOutNames; outputTypes["shared"] = tempOutNames; outputTypes["list"] = tempOutNames; @@ -41,43 +81,17 @@ SubSampleCommand::SubSampleCommand(){ } } //********************************************************************************************************************** -vector SubSampleCommand::getRequiredParameters(){ - try { - string Array[] = {"fasta","list","shared","rabund", "sabund","or"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); - return myArray; - } - catch(exception& e) { - m->errorOut(e, "SubSampleCommand", "getRequiredParameters"); - exit(1); - } -} -//********************************************************************************************************************** -vector SubSampleCommand::getRequiredFiles(){ - try { - vector myArray; - return myArray; - } - catch(exception& e) { - m->errorOut(e, "SubSampleCommand", "getRequiredFiles"); - exit(1); - } -} -//********************************************************************************************************************** SubSampleCommand::SubSampleCommand(string option) { try { - globaldata = GlobalData::getInstance(); abort = false; calledHelp = false; allLines = 1; - labels.clear(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { - //valid paramters for this command - string Array[] = {"fasta", "group", "list","shared","rabund","persample", "sabund","name","size","groups","label","outputdir","inputdir"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + vector myArray = setParameters(); OptionParser parser(option); map parameters = parser.getParameters(); @@ -209,7 +223,7 @@ SubSampleCommand::SubSampleCommand(string option) { else { pickedGroups = true; m->splitAtDash(groups, Groups); - globaldata->Groups = Groups; + m->Groups = Groups; } string temp = validParameter.validFile(parameters, "size", false); if (temp == "not found"){ temp = "0"; } @@ -242,37 +256,6 @@ SubSampleCommand::SubSampleCommand(string option) { exit(1); } } - -//********************************************************************************************************************** - -void SubSampleCommand::help(){ - try { - m->mothurOut("The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n"); - m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size, persample and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"); - m->mothurOut("The namefile is only used with the fasta file, not with the listfile, because the list file should contain all sequences.\n"); - m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n"); - m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"); - m->mothurOut("The size parameter allows you indicate the size of your subsample.\n"); - m->mothurOut("The persample parameter allows you indicate you want to select subsample of the same size from each of your groups, default=false. It is only used with the list and fasta files if a groupfile is given.\n"); - m->mothurOut("persample=false will select a random set of sequences of the size you select, but the number of seqs from each group may differ.\n"); - m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files if a groupfile is given and persample=true, then size=number of seqs in smallest sample, otherwise size=10% of number of seqs.\n"); - m->mothurOut("The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n"); - m->mothurOut("Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n"); - m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n"); - m->mothurOut("The sub.sample command outputs a .subsample file.\n"); - m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n"); - - } - catch(exception& e) { - m->errorOut(e, "SubSampleCommand", "help"); - exit(1); - } -} - -//********************************************************************************************************************** - -SubSampleCommand::~SubSampleCommand(){} - //********************************************************************************************************************** int SubSampleCommand::execute(){ @@ -295,7 +278,44 @@ int SubSampleCommand::execute(){ if (fastafile != "") { getSubSampleFasta(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } - + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } + } + + itTypes = outputTypes.find("list"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); } + } + + itTypes = outputTypes.find("shared"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); } + } + + itTypes = outputTypes.find("rabund"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); } + } + + itTypes = outputTypes.find("sabund"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); } + } + + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -337,13 +357,6 @@ int SubSampleCommand::getSubSampleFasta() { if (m->control_pressed) { return 0; } - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); //make sure that if your picked groups size is not too big int thisSize = names.size(); @@ -356,13 +369,14 @@ int SubSampleCommand::getSubSampleFasta() { if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large - int smallestSize = groupMap->getNumSeqs(Groups[0]); - for (int i = 1; i < Groups.size(); i++) { + vector newGroups; + for (int i = 0; i < Groups.size(); i++) { int thisSize = groupMap->getNumSeqs(Groups[i]); - if (thisSize < smallestSize) { smallestSize = thisSize; } + if (thisSize >= size) { newGroups.push_back(Groups[i]); } + else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } - if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); } + Groups = newGroups; } m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); @@ -458,6 +472,17 @@ int SubSampleCommand::getSubSampleFasta() { } } } + + if (subset.size() == 0) { m->mothurOut("The size you selected is too large, skipping fasta file."); m->mothurOutEndLine(); return 0; } + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); + //read through fasta file outputting only the names on the subsample list ifstream in; m->openInputFile(fastafile, in); @@ -625,14 +650,6 @@ int SubSampleCommand::readNames() { int SubSampleCommand::getSubSampleShared() { try { - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); - InputData* input = new InputData(sharedfile, "sharedfile"); vector lookup = input->getSharedRAbundVectors(); string lastLabel = lookup[0]->getLabel(); @@ -648,9 +665,34 @@ int SubSampleCommand::getSubSampleShared() { if (thisSize < size) { size = thisSize; } } + }else { + m->Groups.clear(); + vector temp; + for (int i = 0; i < lookup.size(); i++) { + if (lookup[i]->getNumSeqs() < size) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete lookup[i]; + }else { + m->Groups.push_back(lookup[i]->getGroup()); + temp.push_back(lookup[i]); + } + } + lookup = temp; + Groups = m->Groups; } - m->mothurOut("Sampling " + toString(size) + " from " + toString(lookup[0]->getNumSeqs()) + "."); m->mothurOutEndLine(); + if (lookup.size() == 0) { m->mothurOut("The size you selected is too large, skipping shared file."); m->mothurOutEndLine(); delete input; return 0; } + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); + + + m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); //as long as you are not at the end of the file or done wih the lines you want while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { @@ -855,13 +897,14 @@ int SubSampleCommand::getSubSampleList() { if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large - int smallestSize = groupMap->getNumSeqs(Groups[0]); - for (int i = 1; i < Groups.size(); i++) { + vector newGroups; + for (int i = 0; i < Groups.size(); i++) { int thisSize = groupMap->getNumSeqs(Groups[i]); - if (thisSize < smallestSize) { smallestSize = thisSize; } + if (thisSize >= size) { newGroups.push_back(Groups[i]); } + else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } - if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); } + Groups = newGroups; } m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); @@ -1104,7 +1147,7 @@ int SubSampleCommand::processList(ListVector*& list, ofstream& out, set& individual += binnames[j]; } } - if (subset.count(individual) != 0) { newNames += individual; } + if (subset.count(individual) != 0) { newNames += individual + ","; } //if there are names in this bin add to new list @@ -1132,15 +1175,6 @@ int SubSampleCommand::processList(ListVector*& list, ofstream& out, set& //********************************************************************************************************************** int SubSampleCommand::getSubSampleRabund() { try { - - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(rabundfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)) + "subsample" + m->getExtension(rabundfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["rabund"].push_back(outputFileName); outputNames.push_back(outputFileName); - InputData* input = new InputData(rabundfile, "rabund"); RAbundVector* rabund = input->getRAbundVector(); string lastLabel = rabund->getLabel(); @@ -1151,10 +1185,18 @@ int SubSampleCommand::getSubSampleRabund() { if (size == 0) { //user has not set size, set size = 10% size = int((rabund->getNumSeqs()) * 0.10); - } + }else if (size > rabund->getNumSeqs()) { m->mothurOut("The size you selected is too large, skipping rabund file."); m->mothurOutEndLine(); delete input; delete rabund; return 0; } m->mothurOut("Sampling " + toString(size) + " from " + toString(rabund->getNumSeqs()) + "."); m->mothurOutEndLine(); + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(rabundfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)) + "subsample" + m->getExtension(rabundfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["rabund"].push_back(outputFileName); outputNames.push_back(outputFileName); + //as long as you are not at the end of the file or done wih the lines you want while((rabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if (m->control_pressed) { delete input; delete rabund; out.close(); return 0; } @@ -1289,15 +1331,7 @@ int SubSampleCommand::processRabund(RAbundVector*& rabund, ofstream& out) { //********************************************************************************************************************** int SubSampleCommand::getSubSampleSabund() { try { - - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(sabundfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)) + "subsample" + m->getExtension(sabundfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["sabund"].push_back(outputFileName); outputNames.push_back(outputFileName); - + InputData* input = new InputData(sabundfile, "sabund"); SAbundVector* sabund = input->getSAbundVector(); string lastLabel = sabund->getLabel(); @@ -1308,10 +1342,20 @@ int SubSampleCommand::getSubSampleSabund() { if (size == 0) { //user has not set size, set size = 10% size = int((sabund->getNumSeqs()) * 0.10); - } + }else if (size > sabund->getNumSeqs()) { m->mothurOut("The size you selected is too large, skipping sabund file."); m->mothurOutEndLine(); delete input; delete sabund; return 0; } + m->mothurOut("Sampling " + toString(size) + " from " + toString(sabund->getNumSeqs()) + "."); m->mothurOutEndLine(); + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sabundfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)) + "subsample" + m->getExtension(sabundfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["sabund"].push_back(outputFileName); outputNames.push_back(outputFileName); + + //as long as you are not at the end of the file or done wih the lines you want while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if (m->control_pressed) { delete input; delete sabund; out.close(); return 0; }