git.donarmstrong.com Git - mothur.git/blobdiff - subsamplecommand.cpp
fixed metastats, added resize to cluster.classic, added code to kill children if...
[mothur.git] / subsamplecommand.cpp
index 04f6c7cac931576b696f3dc9d369e171b1b9d525..036341ff48f9de13d6684fe9632da08742ee7d82 100644 (file)
@@ -12,7 +12,7 @@
 //**********************************************************************************************************************
 vector<string> SubSampleCommand::getValidParameters(){ 
        try {
-               string Array[] =  {"groups","label","outputdir","inputdir"};
+               string Array[] =  {"fasta", "group", "list","shared","rabund", "name","sabund","size","groups","label","outputdir","inputdir"};
                vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                return myArray;
        }
@@ -31,6 +31,9 @@ SubSampleCommand::SubSampleCommand(){
                outputTypes["list"] = tempOutNames;
                outputTypes["rabund"] = tempOutNames;
                outputTypes["sabund"] = tempOutNames;
+               outputTypes["fasta"] = tempOutNames;
+               outputTypes["name"] = tempOutNames;
+               outputTypes["group"] = tempOutNames;
        }
        catch(exception& e) {
                m->errorOut(e, "SubSampleCommand", "GetRelAbundCommand");
@@ -40,7 +43,8 @@ SubSampleCommand::SubSampleCommand(){
 //**********************************************************************************************************************
 vector<string> SubSampleCommand::getRequiredParameters(){      
        try {
-               vector<string> myArray;
+               string Array[] =  {"fasta","list","shared","rabund", "sabund","or"};
+               vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                return myArray;
        }
        catch(exception& e) {
@@ -51,8 +55,7 @@ vector<string> SubSampleCommand::getRequiredParameters(){
 //**********************************************************************************************************************
 vector<string> SubSampleCommand::getRequiredFiles(){   
        try {
-               string Array[] =  {"shared","list","rabund","sabund","or"};
-               vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+               vector<string> myArray;
                return myArray;
        }
        catch(exception& e) {
@@ -73,8 +76,8 @@ SubSampleCommand::SubSampleCommand(string option) {
                
                else {
                        //valid paramters for this command
-                       string AlignArray[] =  {"groups","label","outputdir","inputdir"};
-                       vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+                       string Array[] =  {"fasta", "group", "list","shared","rabund", "sabund","name","size","groups","label","outputdir","inputdir"};
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
                        map<string,string> parameters = parser.getParameters();
@@ -82,7 +85,8 @@ SubSampleCommand::SubSampleCommand(string option) {
                        ValidParameters validParameter;
                        
                        //check to make sure all parameters are valid for command
-                       for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                       map<string,string>::iterator it;
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
                                if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
                        }
                        
@@ -92,16 +96,105 @@ SubSampleCommand::SubSampleCommand(string option) {
                        outputTypes["list"] = tempOutNames;
                        outputTypes["rabund"] = tempOutNames;
                        outputTypes["sabund"] = tempOutNames;
+                       outputTypes["fasta"] = tempOutNames;
+                       outputTypes["name"] = tempOutNames;
+                       outputTypes["group"] = tempOutNames;
                                        
                        //if the user changes the output directory command factory will send this info to us in the output parameter 
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
-                               outputDir = ""; 
-                               outputDir += m->hasPath(globaldata->inputFileName); //if user entered a file with a path then preserve it       
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
+                       
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("list");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["list"] = inputDir + it->second;             }
+                               }
+                               
+                               it = parameters.find("fasta");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("shared");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["shared"] = inputDir + it->second;           }
+                               }
+                               
+                               it = parameters.find("group");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["group"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("sabund");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["sabund"] = inputDir + it->second;           }
+                               }
+                               
+                               it = parameters.find("rabund");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["rabund"] = inputDir + it->second;           }
+                               }
+                               
+                               it = parameters.find("name");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["name"] = inputDir + it->second;             }
+                               }
                        }
                        
-                       //make sure the user has already run the read.otu command
-                       if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { m->mothurOut("You must read a list, sabund, rabund or shared file before you can use the sub.sample command."); m->mothurOutEndLine(); abort = true; }
-
+                       //check for required parameters
+                       listfile = validParameter.validFile(parameters, "list", true);
+                       if (listfile == "not open") { listfile = ""; abort = true; }
+                       else if (listfile == "not found") { listfile = ""; }    
+                       
+                       sabundfile = validParameter.validFile(parameters, "sabund", true);
+                       if (sabundfile == "not open") { sabundfile = ""; abort = true; }        
+                       else if (sabundfile == "not found") { sabundfile = ""; }
+                       
+                       rabundfile = validParameter.validFile(parameters, "rabund", true);
+                       if (rabundfile == "not open") { rabundfile = ""; abort = true; }        
+                       else if (rabundfile == "not found") { rabundfile = ""; }
+                       
+                       fastafile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastafile == "not open") { fastafile = ""; abort = true; }  
+                       else if (fastafile == "not found") { fastafile = ""; }
+                       
+                       sharedfile = validParameter.validFile(parameters, "shared", true);
+                       if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
+                       else if (sharedfile == "not found") { sharedfile = ""; }
+                       
+                       namefile = validParameter.validFile(parameters, "name", true);
+                       if (namefile == "not open") { namefile = ""; abort = true; }    
+                       else if (namefile == "not found") { namefile = ""; }
+                       
+                       groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not open") { groupfile = ""; abort = true; }  
+                       else if (groupfile == "not found") { groupfile = ""; }
+                       
+                       
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
                        label = validParameter.validFile(parameters, "label", false);                   
@@ -111,12 +204,6 @@ SubSampleCommand::SubSampleCommand(string option) {
                                else { allLines = 1;  }
                        }
                        
-                       //if the user has not specified any labels use the ones from read.otu
-                       if (label == "") {  
-                               allLines = globaldata->allLines; 
-                               labels = globaldata->labels; 
-                       }
-                       
                        groups = validParameter.validFile(parameters, "groups", false);                 
                        if (groups == "not found") { groups = ""; pickedGroups = false; }
                        else { 
@@ -125,6 +212,20 @@ SubSampleCommand::SubSampleCommand(string option) {
                                globaldata->Groups = Groups;
                        }
                        
+                       string temp = validParameter.validFile(parameters, "size", false);              if (temp == "not found"){       temp = "0";             }
+                       convert(temp, size);  
+                       
+                       if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; }
+                       
+                       if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) {
+                               m->mothurOut("You must provide a fasta, list, sabund, rabund or shared file as an input file."); m->mothurOutEndLine(); abort = true; }
+                       
+                       if (pickedGroups && ((groupfile == "") && (sharedfile == ""))) { 
+                               m->mothurOut("You cannot pick groups without a valid group file or shared file."); m->mothurOutEndLine(); abort = true; }
+                       
+                       if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { 
+                               m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; }
+                       
                }
 
        }
@@ -138,15 +239,16 @@ SubSampleCommand::SubSampleCommand(string option) {
 
 void SubSampleCommand::help(){
        try {
-               m->mothurOut("The get.relabund command can only be executed after a successful read.otu command of a list and group or shared file.\n");
-               m->mothurOut("The get.relabund command parameters are groups, scale and label.  No parameters are required.\n");
+               m->mothurOut("The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n");
+               m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size and label.  You must provide a fasta, list, sabund, rabund or shared file as an input file.\n");
                m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n");
                m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n");
-               m->mothurOut("The scale parameter allows you to select what scale you would like to use. Choices are totalgroup, totalotu, averagegroup, averageotu, default is totalgroup.\n");
-               m->mothurOut("The get.relabund command should be in the following format: get.relabund(groups=yourGroups, label=yourLabels).\n");
-               m->mothurOut("Example get.relabund(groups=A-B-C, scale=averagegroup).\n");
+               m->mothurOut("The size parameter allows you indicate the size of your subsample.\n");
+               m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files, 10% of number of seqs.\n");
+               m->mothurOut("The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n");
+               m->mothurOut("Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n");
                m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n");
-               m->mothurOut("The get.relabund command outputs a .relabund file.\n");
+               m->mothurOut("The sub.sample command outputs a .subsample file.\n");
                m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
 
        }
@@ -158,8 +260,7 @@ void SubSampleCommand::help(){
 
 //**********************************************************************************************************************
 
-SubSampleCommand::~SubSampleCommand(){
-}
+SubSampleCommand::~SubSampleCommand(){} //empty destructor: no cleanup is performed here
 
 //**********************************************************************************************************************
 
@@ -168,47 +269,15 @@ int SubSampleCommand::execute(){
        
                if (abort == true) { return 0; }
                
-               string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "subsample" +  m->getExtension(globaldata->inputFileName);
-               ofstream out;
-               m->openOutputFile(outputFileName, out);
-               out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
-               
-               string format = globaldata->getFormat();
-               
-               read = new ReadOTUFile(globaldata->inputFileName);      
-               read->read(&*globaldata); 
-               input = globaldata->ginput;
-               
-               if (format == "sharedfile") {
-                       lookup = input->getSharedRAbundVectors();
-                       outputTypes["shared"].push_back(outputFileName);
-                       getSubSampleShared(lookup, out);
-               }else if (format == "list") { 
-                       list = globaldata->glist;
-                       outputTypes["list"].push_back(outputFileName);
-                       //getSubSamplesList();
-               }else if (format == "rabund") { 
-                       rabund = globaldata->rabund;
-                       outputTypes["rabund"].push_back(outputFileName);
-                       //getSubSamplesRabund();
-               
-               }else if (format == "sabund") { 
-                       sabund = globaldata->sabund;
-                       outputTypes["sabund"].push_back(outputFileName);
-                       //getSubSamplesSabund();
-               }
-               
-               out.close();
-                                       
-               //reset groups parameter
-               delete input; globaldata->ginput = NULL;
-               delete read;
-               
-               if (m->control_pressed) { outputTypes.clear(); remove(outputFileName.c_str()); return 0;}
-               
+               if (sharedfile != "")   {   getSubSampleShared();       }
+               //if (listfile != "")           {   getSubSampleList();         }
+               //if (rabund != "")             {   getSubSampleRabund();       }
+               //if (sabundfile != "") {   getSubSampleSabund();       }
+               //if (fastafile != "")  {   getSubSampleFasta();        }
+                               
                m->mothurOutEndLine();
                m->mothurOut("Output File Names: "); m->mothurOutEndLine();
-               m->mothurOut(outputFileName); m->mothurOutEndLine(); outputNames.push_back(outputFileName); 
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
                m->mothurOutEndLine();
                
                return 0;
@@ -219,26 +288,35 @@ int SubSampleCommand::execute(){
        }
 }
 //**********************************************************************************************************************
-int SubSampleCommand::getSubSampleShared(vector<SharedRAbundVector*>& thislookup, ofstream& filename) {
+int SubSampleCommand::getSubSampleShared() {
        try {
+               
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(sharedfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputTypes["shared"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+               
+               InputData* input = new InputData(sharedfile, "sharedfile");
+               vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
+               string lastLabel = lookup[0]->getLabel();
        
                //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
                set<string> processedLabels;
                set<string> userLabels = labels;
 
-               string lastLabel = lookup[0]->getLabel();
        
                //as long as you are not at the end of the file or done wih the lines you want
                while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
-                       if (m->control_pressed) {  return 0;  }
+                       if (m->control_pressed) {  out.close(); return 0;  }
        
                        if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){                  
 
                                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
                                
-                               //process lookup
-                               
-                                                               
+                               processShared(lookup, out);
                                                                                                                                
                                processedLabels.insert(lookup[0]->getLabel());
                                userLabels.erase(lookup[0]->getLabel());
@@ -252,7 +330,8 @@ int SubSampleCommand::getSubSampleShared(vector<SharedRAbundVector*>& thislookup
                                lookup = input->getSharedRAbundVectors(lastLabel);
                                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
                                
-                               //process lookup                                
+                               processShared(lookup, out);
+                               
                                processedLabels.insert(lookup[0]->getLabel());
                                userLabels.erase(lookup[0]->getLabel());
                                
@@ -269,7 +348,7 @@ int SubSampleCommand::getSubSampleShared(vector<SharedRAbundVector*>& thislookup
                }
                
                
-               if (m->control_pressed) {  return 0;  }
+               if (m->control_pressed) {  out.close(); return 0;  }
 
                //output error messages about any remaining user labels
                set<string>::iterator it;
@@ -291,15 +370,12 @@ int SubSampleCommand::getSubSampleShared(vector<SharedRAbundVector*>& thislookup
                        
                        m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
                        
-                       //process lookup
-                       
-                       
+                       processShared(lookup, out);
                        
                        for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
                }
        
-               //reset groups parameter
-               globaldata->Groups.clear();  
+               out.close();  
                                
                return 0;
  
@@ -309,8 +385,74 @@ int SubSampleCommand::getSubSampleShared(vector<SharedRAbundVector*>& thislookup
                exit(1);
        }
 }
-
-
+//**********************************************************************************************************************
+int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofstream& out) {
+       try {
+               
+               if (pickedGroups) { eliminateZeroOTUS(thislookup); }
+               
+               if (size == 0) { //user has not set size, set size = smallest samples size
+                       size = thislookup[0]->getNumSeqs();
+                       for (int i = 1; i < thislookup.size(); i++) {
+                               int thisSize = thislookup[i]->getNumSeqs();
+                               
+                               if (thisSize < size) {  size = thisSize;        }
+                       }
+               }
+               
+               int numBins = thislookup[0]->getNumBins();
+               for (int i = 0; i < thislookup.size(); i++) {           
+                       int thisSize = thislookup[i]->getNumSeqs();
+                               
+                       if (thisSize != size) {
+                               
+                               string thisgroup = thislookup[i]->getGroup();
+       
+                               OrderVector* order = new OrderVector();
+                               for(int p=0;p<numBins;p++){
+                                       for(int j=0;j<thislookup[i]->getAbundance(p);j++){
+                                               order->push_back(p);
+                                       }
+                               }
+                               random_shuffle(order->begin(), order->end());
+                               
+                               SharedRAbundVector* temp = new SharedRAbundVector(numBins);
+                               temp->setLabel(thislookup[i]->getLabel());
+                               temp->setGroup(thislookup[i]->getGroup());
+                               
+                               delete thislookup[i];
+                               thislookup[i] = temp;
+                               
+                               
+                               for (int j = 0; j < size; j++) {
+                                       //get random number to sample from order between 0 and thisSize-1.
+                                       int myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
+                                       
+                                       int bin = order->get(myrand);
+                                       
+                                       int abund = thislookup[i]->getAbundance(bin);
+                                       thislookup[i]->set(bin, (abund+1), thisgroup);
+                               }       
+                               delete order;
+                       }
+               }
+               
+               //subsampling may have created some otus with no sequences in them
+               eliminateZeroOTUS(thislookup);
+               
+               for (int i = 0; i < thislookup.size(); i++) {
+                       out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
+                       thislookup[i]->print(out);
+               }
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
+               exit(1);
+       }
+}
 //**********************************************************************************************************************
 int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
        try {
@@ -326,7 +468,7 @@ int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup)
                //for each bin
                for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
                        if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
-               
+                       
                        //look at each sharedRabund and make sure they are not all zero
                        bool allZero = true;
                        for (int j = 0; j < thislookup.size(); j++) {
@@ -340,13 +482,14 @@ int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup)
                                }
                        }
                }
-
+               
                for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
-
+               thislookup.clear();
+               
                thislookup = newLookup;
                
                return 0;
+               
        }
        catch(exception& e) {
                m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
@@ -357,3 +500,4 @@ int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup)
 //**********************************************************************************************************************
 
 
+