]> git.donarmstrong.com Git - mothur.git/blobdiff - clustersplitcommand.cpp
modified reportfile class
[mothur.git] / clustersplitcommand.cpp
index 050a615253767622c04ac26ff1b747d1e50eed28..cb3fc40aaa6e42a0e802d0f47ec3f8897cc4fa61 100644 (file)
 #include "readmatrix.hpp"
 #include "inputdata.h"
 
+
+//**********************************************************************************************************************
+vector<string> ClusterSplitCommand::getValidParameters(){      // Returns the list of parameter names built from the literal array below.
+       try {
+               string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
+               vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); // sizeof ratio is the element count, so the range covers the whole array
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterSplitCommand", "getValidParameters"); // report the exception tagged with class and method name
+               exit(1); // then terminate the process with status 1
+       }
+}
+//**********************************************************************************************************************
+ClusterSplitCommand::ClusterSplitCommand(){    // Default constructor: marks the command as aborted/help-called and registers its output types.
+       try {
+               abort = true; calledHelp = true; // with both flags set, execute() returns 0 at its abort check
+               vector<string> tempOutNames; // empty list used to seed each output-type entry
+               outputTypes["list"] = tempOutNames;
+               outputTypes["rabund"] = tempOutNames;
+               outputTypes["sabund"] = tempOutNames;
+               outputTypes["column"] = tempOutNames;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand"); // report the exception tagged with class and method name
+               exit(1); // then terminate the process with status 1
+       }
+}
+//**********************************************************************************************************************
+vector<string> ClusterSplitCommand::getRequiredParameters(){   // Returns the list of required parameter names built from the literal array below.
+       try {
+               string Array[] =  {"fasta","phylip","column","or"}; // NOTE(review): trailing "or" is presumably a marker meaning any one of the listed inputs suffices - confirm against the caller
+               vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string))); // sizeof ratio is the element count, so the range covers the whole array
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters"); // report the exception tagged with class and method name
+               exit(1); // then terminate the process with status 1
+       }
+}
+//**********************************************************************************************************************
+vector<string> ClusterSplitCommand::getRequiredFiles(){        // Returns an empty list: no entries are ever added to the vector below.
+       try {
+               vector<string> myArray; // intentionally left empty
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles"); // report the exception tagged with class and method name
+               exit(1); // then terminate the process with status 1
+       }
+}
 //**********************************************************************************************************************
 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 ClusterSplitCommand::ClusterSplitCommand(string option)  {
        try{
                globaldata = GlobalData::getInstance();
-               abort = false;
+               abort = false; calledHelp = false;   
                format = "";
                
                //allow user to run help
-               if(option == "help") { help(); abort = true; }
+               if(option == "help") { help(); abort = true; calledHelp = true; }
                
                else {
                        //valid paramters for this command
@@ -44,6 +95,13 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                                }
                        }
                        
+                       //initialize outputTypes
+                       vector<string> tempOutNames;
+                       outputTypes["list"] = tempOutNames;
+                       outputTypes["rabund"] = tempOutNames;
+                       outputTypes["sabund"] = tempOutNames;
+                       outputTypes["column"] = tempOutNames;
+                       
                        globaldata->newRead();
                        
                        //if the user changes the output directory command factory will send this info to us in the output parameter 
@@ -206,7 +264,7 @@ void ClusterSplitCommand::help(){
                m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
                m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
                m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
-               m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
+               m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
                m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
                #ifdef USE_MPI
                m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
@@ -231,7 +289,7 @@ ClusterSplitCommand::~ClusterSplitCommand(){}
 int ClusterSplitCommand::execute(){
        try {
        
-               if (abort == true) {    return 0;       }
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                
                time_t estart;
                vector<string> listFileNames;
@@ -292,7 +350,7 @@ int ClusterSplitCommand::execute(){
                SplitMatrix* split;
                if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
                else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
-               else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir);      }
+               else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
                else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
                
                split->split();
@@ -303,6 +361,10 @@ int ClusterSplitCommand::execute(){
                vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
                delete split;
                
+               //output a merged distance file
+               if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
+                       
+                               
                if (m->control_pressed) { return 0; }
                
                m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
@@ -317,7 +379,6 @@ int ClusterSplitCommand::execute(){
                                        
                        //for each file group figure out which process will complete it
                        //want to divide the load intelligently so the big files are spread between processes
-                       int count = 1;
                        for (int i = 0; i < distName.size(); i++) { 
                                int processToAssign = (i+1) % processors; 
                                if (processToAssign == 0) { processToAssign = processors; }
@@ -475,7 +536,6 @@ int ClusterSplitCommand::execute(){
                                        
                                        //for each file group figure out which process will complete it
                                        //want to divide the load intelligently so the big files are spread between processes
-                                       int count = 1;
                                        for (int i = 0; i < distName.size(); i++) { 
                                                int processToAssign = (i+1) % processors; 
                                                if (processToAssign == 0) { processToAssign = processors; }
@@ -696,9 +756,9 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
                m->openOutputFile(fileroot+ tag + ".list",              outList);
                                
-               outputNames.push_back(fileroot+ tag + ".sabund");
-               outputNames.push_back(fileroot+ tag + ".rabund");
-               outputNames.push_back(fileroot+ tag + ".list");
+               outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
+               outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
+               outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
                
                map<float, int>::iterator itLabel;
 
@@ -828,7 +888,11 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
                                outLabels.close();
 
                                exit(0);
-                       }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
+                       }else { 
+                               m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                               for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                               exit(0);
+                       }
                }
                
                //force parent to wait until all the processes are done
@@ -924,9 +988,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                        m->openOutputFile(fileroot+ tag + ".list",      listFile);
                
                        listFileNames.push_back(fileroot+ tag + ".list");
-               
-                       time_t estart = time(NULL);
-                       
+                               
                        float previousDist = 0.00000;
                        float rndPreviousDist = 0.00000;
                        
@@ -1016,5 +1078,45 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 
 
 }
+//**********************************************************************************************************************
 
+int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) { // Passes the first key of every map in distNames to appendFiles with one shared output file, then records that file as a "column" output. Always returns 0.
+       try{
+               
+#ifdef USE_MPI
+               int pid;
+               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+               
+               if (pid != 0) { // NOTE(review): under USE_MPI the merge runs only when the rank is non-zero - confirm this is not meant to be pid == 0
+#endif
+               
+               string thisOutputDir = outputDir;
+               if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); } // no output directory set: fall back to the result of hasPath on the fasta file
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist"; // NOTE(review): presumably getRootName leaves a trailing '.' so this ends in ".dist" - confirm
+               remove(outputFileName.c_str()); // remove any existing file of that name before the appendFiles calls below
+               
+               
+               for (int i = 0; i < distNames.size(); i++) {
+                       if (m->control_pressed) {  return 0; } // stop early when control_pressed is set; the output is not registered in that case
+                       
+                       string thisDistFile = distNames[i].begin()->first; // key of the first map entry; NOTE(review): assumes each map is non-empty and keyed by distance file name - confirm
+                       
+                       m->appendFiles(thisDistFile, outputFileName);
+               }       
+                       
+               outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName); // record the merged file under the "column" type and in the overall output list
+                       
+#ifdef USE_MPI
+               }
+#endif
+                               
+               return 0;       
+               
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile"); // report the exception tagged with class and method name
+               exit(1); // then terminate the process with status 1
+       }
+}
 //**********************************************************************************************************************