5 * Created by westcott on 10/27/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "subsamplecommand.h"
12 //**********************************************************************************************************************
13 vector<string> SubSampleCommand::getValidParameters(){
15 string Array[] = {"fasta", "group", "list","shared","rabund", "name","sabund","size","groups","label","outputdir","inputdir"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "SubSampleCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 SubSampleCommand::SubSampleCommand(){
28 //initialize outputTypes
29 vector<string> tempOutNames;
30 outputTypes["shared"] = tempOutNames;
31 outputTypes["list"] = tempOutNames;
32 outputTypes["rabund"] = tempOutNames;
33 outputTypes["sabund"] = tempOutNames;
34 outputTypes["fasta"] = tempOutNames;
35 outputTypes["name"] = tempOutNames;
36 outputTypes["group"] = tempOutNames;
39 m->errorOut(e, "SubSampleCommand", "GetRelAbundCommand");
43 //**********************************************************************************************************************
44 vector<string> SubSampleCommand::getRequiredParameters(){
46 string Array[] = {"fasta","list","shared","rabund", "sabund","or"};
47 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
51 m->errorOut(e, "SubSampleCommand", "getRequiredParameters");
55 //**********************************************************************************************************************
56 vector<string> SubSampleCommand::getRequiredFiles(){
58 vector<string> myArray;
62 m->errorOut(e, "SubSampleCommand", "getRequiredFiles");
66 //**********************************************************************************************************************
67 SubSampleCommand::SubSampleCommand(string option) {
69 globaldata = GlobalData::getInstance();
74 //allow user to run help
75 if(option == "help") { help(); abort = true; }
78 //valid paramters for this command
79 string Array[] = {"fasta", "group", "list","shared","rabund", "sabund","name","size","groups","label","outputdir","inputdir"};
80 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
82 OptionParser parser(option);
83 map<string,string> parameters = parser.getParameters();
85 ValidParameters validParameter;
87 //check to make sure all parameters are valid for command
88 map<string,string>::iterator it;
89 for (it = parameters.begin(); it != parameters.end(); it++) {
90 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
93 //initialize outputTypes
94 vector<string> tempOutNames;
95 outputTypes["shared"] = tempOutNames;
96 outputTypes["list"] = tempOutNames;
97 outputTypes["rabund"] = tempOutNames;
98 outputTypes["sabund"] = tempOutNames;
99 outputTypes["fasta"] = tempOutNames;
100 outputTypes["name"] = tempOutNames;
101 outputTypes["group"] = tempOutNames;
103 //if the user changes the output directory command factory will send this info to us in the output parameter
104 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
111 it = parameters.find("list");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["list"] = inputDir + it->second; }
119 it = parameters.find("fasta");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["fasta"] = inputDir + it->second; }
127 it = parameters.find("shared");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["shared"] = inputDir + it->second; }
135 it = parameters.find("group");
136 //user has given a template file
137 if(it != parameters.end()){
138 path = m->hasPath(it->second);
139 //if the user has not given a path then, add inputdir. else leave path alone.
140 if (path == "") { parameters["group"] = inputDir + it->second; }
143 it = parameters.find("sabund");
144 //user has given a template file
145 if(it != parameters.end()){
146 path = m->hasPath(it->second);
147 //if the user has not given a path then, add inputdir. else leave path alone.
148 if (path == "") { parameters["sabund"] = inputDir + it->second; }
151 it = parameters.find("rabund");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["rabund"] = inputDir + it->second; }
159 it = parameters.find("name");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["name"] = inputDir + it->second; }
168 //check for required parameters
169 listfile = validParameter.validFile(parameters, "list", true);
170 if (listfile == "not open") { listfile = ""; abort = true; }
171 else if (listfile == "not found") { listfile = ""; }
173 sabundfile = validParameter.validFile(parameters, "sabund", true);
174 if (sabundfile == "not open") { sabundfile = ""; abort = true; }
175 else if (sabundfile == "not found") { sabundfile = ""; }
177 rabundfile = validParameter.validFile(parameters, "rabund", true);
178 if (rabundfile == "not open") { rabundfile = ""; abort = true; }
179 else if (rabundfile == "not found") { rabundfile = ""; }
181 fastafile = validParameter.validFile(parameters, "fasta", true);
182 if (fastafile == "not open") { fastafile = ""; abort = true; }
183 else if (fastafile == "not found") { fastafile = ""; }
185 sharedfile = validParameter.validFile(parameters, "shared", true);
186 if (sharedfile == "not open") { sharedfile = ""; abort = true; }
187 else if (sharedfile == "not found") { sharedfile = ""; }
189 namefile = validParameter.validFile(parameters, "name", true);
190 if (namefile == "not open") { namefile = ""; abort = true; }
191 else if (namefile == "not found") { namefile = ""; }
193 groupfile = validParameter.validFile(parameters, "group", true);
194 if (groupfile == "not open") { groupfile = ""; abort = true; }
195 else if (groupfile == "not found") { groupfile = ""; }
198 //check for optional parameter and set defaults
199 // ...at some point should added some additional type checking...
200 label = validParameter.validFile(parameters, "label", false);
201 if (label == "not found") { label = ""; }
203 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
204 else { allLines = 1; }
207 groups = validParameter.validFile(parameters, "groups", false);
208 if (groups == "not found") { groups = ""; pickedGroups = false; }
211 m->splitAtDash(groups, Groups);
212 globaldata->Groups = Groups;
215 string temp = validParameter.validFile(parameters, "size", false); if (temp == "not found"){ temp = "0"; }
218 if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; }
220 if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) {
221 m->mothurOut("You must provide a fasta, list, sabund, rabund or shared file as an input file."); m->mothurOutEndLine(); abort = true; }
223 if (pickedGroups && ((groupfile == "") && (sharedfile == ""))) {
224 m->mothurOut("You cannot pick groups without a valid group file or shared file."); m->mothurOutEndLine(); abort = true; }
226 if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) {
227 m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; }
232 catch(exception& e) {
233 m->errorOut(e, "SubSampleCommand", "SubSampleCommand");
238 //**********************************************************************************************************************
240 void SubSampleCommand::help(){
242 m->mothurOut("The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n");
243 m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n");
244 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n");
245 m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n");
246 m->mothurOut("The size parameter allows you indicate the size of your subsample.\n");
247 m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files, 10% of number of seqs.\n");
248 m->mothurOut("The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n");
249 m->mothurOut("Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n");
250 m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n");
251 m->mothurOut("The sub.sample command outputs a .subsample file.\n");
252 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
255 catch(exception& e) {
256 m->errorOut(e, "SubSampleCommand", "help");
261 //**********************************************************************************************************************
263 SubSampleCommand::~SubSampleCommand(){}
265 //**********************************************************************************************************************
267 int SubSampleCommand::execute(){
270 if (abort == true) { return 0; }
272 if (sharedfile != "") { getSubSampleShared(); }
273 //if (listfile != "") { getSubSampleList(); }
274 //if (rabund != "") { getSubSampleRabund(); }
275 //if (sabundfile != "") { getSubSampleSabund(); }
276 //if (fastafile != "") { getSubSampleFasta(); }
278 m->mothurOutEndLine();
279 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
280 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
281 m->mothurOutEndLine();
285 catch(exception& e) {
286 m->errorOut(e, "SubSampleCommand", "execute");
290 //**********************************************************************************************************************
291 int SubSampleCommand::getSubSampleShared() {
294 string thisOutputDir = outputDir;
295 if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); }
296 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile);
299 m->openOutputFile(outputFileName, out);
300 outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName);
302 InputData* input = new InputData(sharedfile, "sharedfile");
303 vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
304 string lastLabel = lookup[0]->getLabel();
306 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
307 set<string> processedLabels;
308 set<string> userLabels = labels;
311 //as long as you are not at the end of the file or done wih the lines you want
312 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
313 if (m->control_pressed) { out.close(); return 0; }
315 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
317 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
319 processShared(lookup, out);
321 processedLabels.insert(lookup[0]->getLabel());
322 userLabels.erase(lookup[0]->getLabel());
325 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
326 string saveLabel = lookup[0]->getLabel();
328 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
330 lookup = input->getSharedRAbundVectors(lastLabel);
331 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
333 processShared(lookup, out);
335 processedLabels.insert(lookup[0]->getLabel());
336 userLabels.erase(lookup[0]->getLabel());
338 //restore real lastlabel to save below
339 lookup[0]->setLabel(saveLabel);
342 lastLabel = lookup[0]->getLabel();
343 //prevent memory leak
344 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
346 //get next line to process
347 lookup = input->getSharedRAbundVectors();
351 if (m->control_pressed) { out.close(); return 0; }
353 //output error messages about any remaining user labels
354 set<string>::iterator it;
355 bool needToRun = false;
356 for (it = userLabels.begin(); it != userLabels.end(); it++) {
357 m->mothurOut("Your file does not include the label " + *it);
358 if (processedLabels.count(lastLabel) != 1) {
359 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
362 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
366 //run last label if you need to
367 if (needToRun == true) {
368 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
369 lookup = input->getSharedRAbundVectors(lastLabel);
371 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
373 processShared(lookup, out);
375 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
383 catch(exception& e) {
384 m->errorOut(e, "SubSampleCommand", "getSubSampleShared");
388 //**********************************************************************************************************************
389 int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofstream& out) {
392 if (pickedGroups) { eliminateZeroOTUS(thislookup); }
394 if (size == 0) { //user has not set size, set size = smallest samples size
395 size = thislookup[0]->getNumSeqs();
396 for (int i = 1; i < thislookup.size(); i++) {
397 int thisSize = thislookup[i]->getNumSeqs();
399 if (thisSize < size) { size = thisSize; }
403 int numBins = thislookup[0]->getNumBins();
404 for (int i = 0; i < thislookup.size(); i++) {
405 int thisSize = thislookup[i]->getNumSeqs();
407 if (thisSize != size) {
409 string thisgroup = thislookup[i]->getGroup();
411 OrderVector* order = new OrderVector();
412 for(int p=0;p<numBins;p++){
413 for(int j=0;j<thislookup[i]->getAbundance(p);j++){
417 random_shuffle(order->begin(), order->end());
419 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
420 temp->setLabel(thislookup[i]->getLabel());
421 temp->setGroup(thislookup[i]->getGroup());
423 delete thislookup[i];
424 thislookup[i] = temp;
427 for (int j = 0; j < size; j++) {
428 //get random number to sample from order between 0 and thisSize-1.
429 int myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
431 int bin = order->get(myrand);
433 int abund = thislookup[i]->getAbundance(bin);
434 thislookup[i]->set(bin, (abund+1), thisgroup);
440 //subsampling may have created some otus with no sequences in them
441 eliminateZeroOTUS(thislookup);
443 for (int i = 0; i < thislookup.size(); i++) {
444 out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
445 thislookup[i]->print(out);
451 catch(exception& e) {
452 m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
456 //**********************************************************************************************************************
457 int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
460 vector<SharedRAbundVector*> newLookup;
461 for (int i = 0; i < thislookup.size(); i++) {
462 SharedRAbundVector* temp = new SharedRAbundVector();
463 temp->setLabel(thislookup[i]->getLabel());
464 temp->setGroup(thislookup[i]->getGroup());
465 newLookup.push_back(temp);
469 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
470 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
472 //look at each sharedRabund and make sure they are not all zero
474 for (int j = 0; j < thislookup.size(); j++) {
475 if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
478 //if they are not all zero add this bin
480 for (int j = 0; j < thislookup.size(); j++) {
481 newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
486 for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
489 thislookup = newLookup;
494 catch(exception& e) {
495 m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
500 //**********************************************************************************************************************