]> git.donarmstrong.com Git - mothur.git/blob - subsamplecommand.cpp
fixed metastats, added resize to cluster.classic, added code to kill children if...
[mothur.git] / subsamplecommand.cpp
1 /*
2  *  subsamplecommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "subsamplecommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> SubSampleCommand::getValidParameters(){  
14         try {
15                 string Array[] =  {"fasta", "group", "list","shared","rabund", "name","sabund","size","groups","label","outputdir","inputdir"};
16                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
17                 return myArray;
18         }
19         catch(exception& e) {
20                 m->errorOut(e, "SubSampleCommand", "getValidParameters");
21                 exit(1);
22         }
23 }
24 //**********************************************************************************************************************
25 SubSampleCommand::SubSampleCommand(){   
26         try {
27                 abort = true;
28                 //initialize outputTypes
29                 vector<string> tempOutNames;
30                 outputTypes["shared"] = tempOutNames;
31                 outputTypes["list"] = tempOutNames;
32                 outputTypes["rabund"] = tempOutNames;
33                 outputTypes["sabund"] = tempOutNames;
34                 outputTypes["fasta"] = tempOutNames;
35                 outputTypes["name"] = tempOutNames;
36                 outputTypes["group"] = tempOutNames;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "SubSampleCommand", "GetRelAbundCommand");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 vector<string> SubSampleCommand::getRequiredParameters(){       
45         try {
46                 string Array[] =  {"fasta","list","shared","rabund", "sabund","or"};
47                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
48                 return myArray;
49         }
50         catch(exception& e) {
51                 m->errorOut(e, "SubSampleCommand", "getRequiredParameters");
52                 exit(1);
53         }
54 }
55 //**********************************************************************************************************************
56 vector<string> SubSampleCommand::getRequiredFiles(){    
57         try {
58                 vector<string> myArray;
59                 return myArray;
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "SubSampleCommand", "getRequiredFiles");
63                 exit(1);
64         }
65 }
66 //**********************************************************************************************************************
67 SubSampleCommand::SubSampleCommand(string option) {
68         try {
69                 globaldata = GlobalData::getInstance();
70                 abort = false;
71                 allLines = 1;
72                 labels.clear();
73                 
74                 //allow user to run help
75                 if(option == "help") { help(); abort = true; }
76                 
77                 else {
78                         //valid paramters for this command
79                         string Array[] =  {"fasta", "group", "list","shared","rabund", "sabund","name","size","groups","label","outputdir","inputdir"};
80                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
81                         
82                         OptionParser parser(option);
83                         map<string,string> parameters = parser.getParameters();
84                         
85                         ValidParameters validParameter;
86                         
87                         //check to make sure all parameters are valid for command
88                         map<string,string>::iterator it;
89                         for (it = parameters.begin(); it != parameters.end(); it++) { 
90                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
91                         }
92                         
93                         //initialize outputTypes
94                         vector<string> tempOutNames;
95                         outputTypes["shared"] = tempOutNames;
96                         outputTypes["list"] = tempOutNames;
97                         outputTypes["rabund"] = tempOutNames;
98                         outputTypes["sabund"] = tempOutNames;
99                         outputTypes["fasta"] = tempOutNames;
100                         outputTypes["name"] = tempOutNames;
101                         outputTypes["group"] = tempOutNames;
102                                         
103                         //if the user changes the output directory command factory will send this info to us in the output parameter 
104                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
105                         
106                         //if the user changes the input directory command factory will send this info to us in the output parameter 
107                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
108                         if (inputDir == "not found"){   inputDir = "";          }
109                         else {
110                                 string path;
111                                 it = parameters.find("list");
112                                 //user has given a template file
113                                 if(it != parameters.end()){ 
114                                         path = m->hasPath(it->second);
115                                         //if the user has not given a path then, add inputdir. else leave path alone.
116                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
117                                 }
118                                 
119                                 it = parameters.find("fasta");
120                                 //user has given a template file
121                                 if(it != parameters.end()){ 
122                                         path = m->hasPath(it->second);
123                                         //if the user has not given a path then, add inputdir. else leave path alone.
124                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
125                                 }
126                                 
127                                 it = parameters.find("shared");
128                                 //user has given a template file
129                                 if(it != parameters.end()){ 
130                                         path = m->hasPath(it->second);
131                                         //if the user has not given a path then, add inputdir. else leave path alone.
132                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
133                                 }
134                                 
135                                 it = parameters.find("group");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
141                                 }
142                                 
143                                 it = parameters.find("sabund");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["sabund"] = inputDir + it->second;           }
149                                 }
150                                 
151                                 it = parameters.find("rabund");
152                                 //user has given a template file
153                                 if(it != parameters.end()){ 
154                                         path = m->hasPath(it->second);
155                                         //if the user has not given a path then, add inputdir. else leave path alone.
156                                         if (path == "") {       parameters["rabund"] = inputDir + it->second;           }
157                                 }
158                                 
159                                 it = parameters.find("name");
160                                 //user has given a template file
161                                 if(it != parameters.end()){ 
162                                         path = m->hasPath(it->second);
163                                         //if the user has not given a path then, add inputdir. else leave path alone.
164                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
165                                 }
166                         }
167                         
168                         //check for required parameters
169                         listfile = validParameter.validFile(parameters, "list", true);
170                         if (listfile == "not open") { listfile = ""; abort = true; }
171                         else if (listfile == "not found") { listfile = ""; }    
172                         
173                         sabundfile = validParameter.validFile(parameters, "sabund", true);
174                         if (sabundfile == "not open") { sabundfile = ""; abort = true; }        
175                         else if (sabundfile == "not found") { sabundfile = ""; }
176                         
177                         rabundfile = validParameter.validFile(parameters, "rabund", true);
178                         if (rabundfile == "not open") { rabundfile = ""; abort = true; }        
179                         else if (rabundfile == "not found") { rabundfile = ""; }
180                         
181                         fastafile = validParameter.validFile(parameters, "fasta", true);
182                         if (fastafile == "not open") { fastafile = ""; abort = true; }  
183                         else if (fastafile == "not found") { fastafile = ""; }
184                         
185                         sharedfile = validParameter.validFile(parameters, "shared", true);
186                         if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
187                         else if (sharedfile == "not found") { sharedfile = ""; }
188                         
189                         namefile = validParameter.validFile(parameters, "name", true);
190                         if (namefile == "not open") { namefile = ""; abort = true; }    
191                         else if (namefile == "not found") { namefile = ""; }
192                         
193                         groupfile = validParameter.validFile(parameters, "group", true);
194                         if (groupfile == "not open") { groupfile = ""; abort = true; }  
195                         else if (groupfile == "not found") { groupfile = ""; }
196                         
197                         
198                         //check for optional parameter and set defaults
199                         // ...at some point should added some additional type checking...
200                         label = validParameter.validFile(parameters, "label", false);                   
201                         if (label == "not found") { label = ""; }
202                         else { 
203                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
204                                 else { allLines = 1;  }
205                         }
206                         
207                         groups = validParameter.validFile(parameters, "groups", false);                 
208                         if (groups == "not found") { groups = ""; pickedGroups = false; }
209                         else { 
210                                 pickedGroups = true;
211                                 m->splitAtDash(groups, Groups);
212                                 globaldata->Groups = Groups;
213                         }
214                         
215                         string temp = validParameter.validFile(parameters, "size", false);              if (temp == "not found"){       temp = "0";             }
216                         convert(temp, size);  
217                         
218                         if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; }
219                         
220                         if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) {
221                                 m->mothurOut("You must provide a fasta, list, sabund, rabund or shared file as an input file."); m->mothurOutEndLine(); abort = true; }
222                         
223                         if (pickedGroups && ((groupfile == "") && (sharedfile == ""))) { 
224                                 m->mothurOut("You cannot pick groups without a valid group file or shared file."); m->mothurOutEndLine(); abort = true; }
225                         
226                         if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { 
227                                 m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; }
228                         
229                 }
230
231         }
232         catch(exception& e) {
233                 m->errorOut(e, "SubSampleCommand", "SubSampleCommand");
234                 exit(1);
235         }
236 }
237
238 //**********************************************************************************************************************
239
240 void SubSampleCommand::help(){
241         try {
242                 m->mothurOut("The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n");
243                 m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size and label.  You must provide a fasta, list, sabund, rabund or shared file as an input file.\n");
244                 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n");
245                 m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n");
246                 m->mothurOut("The size parameter allows you indicate the size of your subsample.\n");
247                 m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files, 10% of number of seqs.\n");
248                 m->mothurOut("The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n");
249                 m->mothurOut("Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n");
250                 m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n");
251                 m->mothurOut("The sub.sample command outputs a .subsample file.\n");
252                 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
253
254         }
255         catch(exception& e) {
256                 m->errorOut(e, "SubSampleCommand", "help");
257                 exit(1);
258         }
259 }
260
261 //**********************************************************************************************************************
262
263 SubSampleCommand::~SubSampleCommand(){}
264
265 //**********************************************************************************************************************
266
267 int SubSampleCommand::execute(){
268         try {
269         
270                 if (abort == true) { return 0; }
271                 
272                 if (sharedfile != "")   {   getSubSampleShared();       }
273                 //if (listfile != "")           {   getSubSampleList();         }
274                 //if (rabund != "")             {   getSubSampleRabund();       }
275                 //if (sabundfile != "") {   getSubSampleSabund();       }
276                 //if (fastafile != "")  {   getSubSampleFasta();        }
277                                 
278                 m->mothurOutEndLine();
279                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
280                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
281                 m->mothurOutEndLine();
282                 
283                 return 0;
284         }
285         catch(exception& e) {
286                 m->errorOut(e, "SubSampleCommand", "execute");
287                 exit(1);
288         }
289 }
290 //**********************************************************************************************************************
291 int SubSampleCommand::getSubSampleShared() {
292         try {
293                 
294                 string thisOutputDir = outputDir;
295                 if (outputDir == "") {  thisOutputDir += m->hasPath(sharedfile);  }
296                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile);
297                 
298                 ofstream out;
299                 m->openOutputFile(outputFileName, out);
300                 outputTypes["shared"].push_back(outputFileName);  outputNames.push_back(outputFileName);
301                 
302                 InputData* input = new InputData(sharedfile, "sharedfile");
303                 vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
304                 string lastLabel = lookup[0]->getLabel();
305         
306                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
307                 set<string> processedLabels;
308                 set<string> userLabels = labels;
309
310         
311                 //as long as you are not at the end of the file or done wih the lines you want
312                 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
313                         if (m->control_pressed) {  out.close(); return 0;  }
314         
315                         if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){                  
316
317                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
318                                 
319                                 processShared(lookup, out);
320                                                                                                                                 
321                                 processedLabels.insert(lookup[0]->getLabel());
322                                 userLabels.erase(lookup[0]->getLabel());
323                         }
324                         
325                         if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
326                                 string saveLabel = lookup[0]->getLabel();
327                 
328                                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  
329                 
330                                 lookup = input->getSharedRAbundVectors(lastLabel);
331                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
332                                 
333                                 processShared(lookup, out);
334                                 
335                                 processedLabels.insert(lookup[0]->getLabel());
336                                 userLabels.erase(lookup[0]->getLabel());
337                                 
338                                 //restore real lastlabel to save below
339                                 lookup[0]->setLabel(saveLabel);
340                         }
341                         
342                         lastLabel = lookup[0]->getLabel();
343                         //prevent memory leak
344                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
345                                                 
346                         //get next line to process
347                         lookup = input->getSharedRAbundVectors();                               
348                 }
349                 
350                 
351                 if (m->control_pressed) {  out.close(); return 0;  }
352
353                 //output error messages about any remaining user labels
354                 set<string>::iterator it;
355                 bool needToRun = false;
356                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
357                         m->mothurOut("Your file does not include the label " + *it); 
358                         if (processedLabels.count(lastLabel) != 1) {
359                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
360                                 needToRun = true;
361                         }else {
362                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
363                         }
364                 }
365         
366                 //run last label if you need to
367                 if (needToRun == true)  {
368                         for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }  
369                         lookup = input->getSharedRAbundVectors(lastLabel);
370                         
371                         m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
372                         
373                         processShared(lookup, out);
374                         
375                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
376                 }
377         
378                 out.close();  
379                                 
380                 return 0;
381  
382         }
383         catch(exception& e) {
384                 m->errorOut(e, "SubSampleCommand", "getSubSampleShared");
385                 exit(1);
386         }
387 }
388 //**********************************************************************************************************************
389 int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofstream& out) {
390         try {
391                 
392                 if (pickedGroups) { eliminateZeroOTUS(thislookup); }
393                 
394                 if (size == 0) { //user has not set size, set size = smallest samples size
395                         size = thislookup[0]->getNumSeqs();
396                         for (int i = 1; i < thislookup.size(); i++) {
397                                 int thisSize = thislookup[i]->getNumSeqs();
398                                 
399                                 if (thisSize < size) {  size = thisSize;        }
400                         }
401                 }
402                 
403                 int numBins = thislookup[0]->getNumBins();
404                 for (int i = 0; i < thislookup.size(); i++) {           
405                         int thisSize = thislookup[i]->getNumSeqs();
406                                 
407                         if (thisSize != size) {
408                                 
409                                 string thisgroup = thislookup[i]->getGroup();
410         
411                                 OrderVector* order = new OrderVector();
412                                 for(int p=0;p<numBins;p++){
413                                         for(int j=0;j<thislookup[i]->getAbundance(p);j++){
414                                                 order->push_back(p);
415                                         }
416                                 }
417                                 random_shuffle(order->begin(), order->end());
418                                 
419                                 SharedRAbundVector* temp = new SharedRAbundVector(numBins);
420                                 temp->setLabel(thislookup[i]->getLabel());
421                                 temp->setGroup(thislookup[i]->getGroup());
422                                 
423                                 delete thislookup[i];
424                                 thislookup[i] = temp;
425                                 
426                                 
427                                 for (int j = 0; j < size; j++) {
428                                         //get random number to sample from order between 0 and thisSize-1.
429                                         int myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
430                                         
431                                         int bin = order->get(myrand);
432                                         
433                                         int abund = thislookup[i]->getAbundance(bin);
434                                         thislookup[i]->set(bin, (abund+1), thisgroup);
435                                 }       
436                                 delete order;
437                         }
438                 }
439                 
440                 //subsampling may have created some otus with no sequences in them
441                 eliminateZeroOTUS(thislookup);
442                 
443                 for (int i = 0; i < thislookup.size(); i++) {
444                         out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
445                         thislookup[i]->print(out);
446                 }
447                 
448                 return 0;
449                 
450         }
451         catch(exception& e) {
452                 m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
453                 exit(1);
454         }
455 }
456 //**********************************************************************************************************************
457 int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
458         try {
459                 
460                 vector<SharedRAbundVector*> newLookup;
461                 for (int i = 0; i < thislookup.size(); i++) {
462                         SharedRAbundVector* temp = new SharedRAbundVector();
463                         temp->setLabel(thislookup[i]->getLabel());
464                         temp->setGroup(thislookup[i]->getGroup());
465                         newLookup.push_back(temp);
466                 }
467                 
468                 //for each bin
469                 for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
470                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
471                         
472                         //look at each sharedRabund and make sure they are not all zero
473                         bool allZero = true;
474                         for (int j = 0; j < thislookup.size(); j++) {
475                                 if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
476                         }
477                         
478                         //if they are not all zero add this bin
479                         if (!allZero) {
480                                 for (int j = 0; j < thislookup.size(); j++) {
481                                         newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
482                                 }
483                         }
484                 }
485                 
486                 for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
487                 thislookup.clear();
488                 
489                 thislookup = newLookup;
490                 
491                 return 0;
492                 
493         }
494         catch(exception& e) {
495                 m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
496                 exit(1);
497         }
498 }
499
500 //**********************************************************************************************************************
501
502
503