]> git.donarmstrong.com Git - mothur.git/blobdiff - clustersplitcommand.cpp
removed unused copy constructors and comments within comments that where causing...
[mothur.git] / clustersplitcommand.cpp
index 6b5012abaa755ad9c72e61d4ece9ad488782b5bd..bb9296fe93b25e663d8f0f226dfaf8b4fadc4b54 100644 (file)
@@ -8,61 +8,86 @@
  */
 
 #include "clustersplitcommand.h"
-#include "readcluster.h"
-#include "splitmatrix.h"
-#include "readphylip.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "inputdata.h"
+
 
 
 //**********************************************************************************************************************
-vector<string> ClusterSplitCommand::getValidParameters(){      
+vector<string> ClusterSplitCommand::setParameters(){   
        try {
-               string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
-               vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+               CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
+               CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
+               CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
+               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
+               CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
+               CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
+               CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
+               CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
+               CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
+               CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
+               CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+               CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
+               CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
+               CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
+               CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+                       
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
                return myArray;
        }
        catch(exception& e) {
-               m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
+               m->errorOut(e, "ClusterSplitCommand", "setParameters");
                exit(1);
        }
 }
 //**********************************************************************************************************************
-ClusterSplitCommand::ClusterSplitCommand(){    
+string ClusterSplitCommand::getHelpString(){   
        try {
-               abort = true;
-               //initialize outputTypes
-               vector<string> tempOutNames;
-               outputTypes["list"] = tempOutNames;
-               outputTypes["rabund"] = tempOutNames;
-               outputTypes["sabund"] = tempOutNames;
+               string helpString = "";
+               helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
+               helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
+               helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
+               helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
+               helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
+               helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
+               helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
+               helpString += "The phylip and column parameter allow you to enter your distance file. \n";
+               helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
+               helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
+               helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
+               helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
+               helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
+               helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
+               helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
+               helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
+               helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
+#ifdef USE_MPI
+               helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
+#endif
+               helpString += "The cluster.split command should be in the following format: \n";
+               helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
+               helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
+               return helpString;
        }
        catch(exception& e) {
-               m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
+               m->errorOut(e, "ClusterSplitCommand", "getHelpString");
                exit(1);
        }
 }
 //**********************************************************************************************************************
-vector<string> ClusterSplitCommand::getRequiredParameters(){   
-       try {
-               string Array[] =  {"fasta","phylip","column","or"};
-               vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
-               return myArray;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
-               exit(1);
-       }
-}
-//**********************************************************************************************************************
-vector<string> ClusterSplitCommand::getRequiredFiles(){        
+ClusterSplitCommand::ClusterSplitCommand(){    
        try {
-               vector<string> myArray;
-               return myArray;
+               abort = true; calledHelp = true; 
+               setParameters();
+               vector<string> tempOutNames;
+               outputTypes["list"] = tempOutNames;
+               outputTypes["rabund"] = tempOutNames;
+               outputTypes["sabund"] = tempOutNames;
+               outputTypes["column"] = tempOutNames;
        }
        catch(exception& e) {
-               m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
+               m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
                exit(1);
        }
 }
@@ -70,17 +95,15 @@ vector<string> ClusterSplitCommand::getRequiredFiles(){
 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 ClusterSplitCommand::ClusterSplitCommand(string option)  {
        try{
-               globaldata = GlobalData::getInstance();
-               abort = false;
+               abort = false; calledHelp = false;   
                format = "";
                
                //allow user to run help
-               if(option == "help") { help(); abort = true; }
+               if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
                
                else {
-                       //valid paramters for this command
-                       string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
-                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+                       vector<string> myArray = setParameters();
                        
                        OptionParser parser(option);
                        map<string,string> parameters = parser.getParameters();
@@ -100,8 +123,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                        outputTypes["list"] = tempOutNames;
                        outputTypes["rabund"] = tempOutNames;
                        outputTypes["sabund"] = tempOutNames;
-                       
-                       globaldata->newRead();
+                       outputTypes["column"] = tempOutNames;
                        
                        //if the user changes the output directory command factory will send this info to us in the output parameter 
                        outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -156,36 +178,77 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                        phylipfile = validParameter.validFile(parameters, "phylip", true);
                        if (phylipfile == "not open") { abort = true; }
                        else if (phylipfile == "not found") { phylipfile = ""; }        
-                       else {  distfile = phylipfile;  format = "phylip";      }
+                       else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
                        
                        columnfile = validParameter.validFile(parameters, "column", true);
                        if (columnfile == "not open") { abort = true; } 
                        else if (columnfile == "not found") { columnfile = ""; }
-                       else {  distfile = columnfile; format = "column";       }
+                       else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
                        
                        namefile = validParameter.validFile(parameters, "name", true);
                        if (namefile == "not open") { abort = true; }   
-                       else if (namefile == "not found") { namefile = ""; }
+                       else if (namefile == "not found") { namefile = "";  }
+                       else { m->setNameFile(namefile); }
                        
                        fastafile = validParameter.validFile(parameters, "fasta", true);
                        if (fastafile == "not open") { abort = true; }  
                        else if (fastafile == "not found") { fastafile = ""; }
-                       else { distfile = fastafile;  splitmethod = "fasta";  }
+                       else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
                        
                        taxFile = validParameter.validFile(parameters, "taxonomy", true);
-                       if (taxFile == "not open") { abort = true; }    
+                       if (taxFile == "not open") { taxFile = ""; abort = true; }      
                        else if (taxFile == "not found") { taxFile = ""; }
-                       
-                       if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
+                       else {  m->setTaxonomyFile(taxFile); }
+                       
+                       if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
+                               //is there are current file available for either of these?
+                               //give priority to column, then phylip, then fasta
+                               columnfile = m->getColumnFile(); 
+                               if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
+                               else { 
+                                       phylipfile = m->getPhylipFile(); 
+                                       if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
+                                       else { 
+                                               fastafile = m->getFastaFile(); 
+                                               if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+                                               else { 
+                                                       m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
+                                                       abort = true; 
+                                               }
+                                       }
+                               }
+                       }
                        else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
                
                        if (columnfile != "") {
-                               if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
+                               if (namefile == "") { 
+                                       namefile = m->getNameFile(); 
+                                       if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
+                                       else { 
+                                               m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
+                                               abort = true; 
+                                       }       
+                               }
                        }
                        
                        if (fastafile != "") {
-                               if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
-                               if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
+                               if (taxFile == "") { 
+                                       taxFile = m->getTaxonomyFile(); 
+                                       if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
+                                       else { 
+                                               m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
+                                               abort = true; 
+                                       }       
+                               }
+                               
+                               if (namefile == "") { 
+                                       namefile = m->getNameFile(); 
+                                       if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
+                                       else { 
+                                               m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
+                                               abort = true; 
+                                       }       
+                               }
                        }
                                        
                        //check for optional parameter and set defaults
@@ -196,16 +259,17 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                        if (temp == "not found") { temp = "100"; }
                        //saves precision legnth for formatting below
                        length = temp.length();
-                       convert(temp, precision); 
+                       m->mothurConvert(temp, precision); 
                        
-                       temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
+                       temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
                        hard = m->isTrue(temp);
                        
                        temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
                        large = m->isTrue(temp);
                        
-                       temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
-                       convert(temp, processors); 
+                       temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
+                       m->setProcessors(temp);
+                       m->mothurConvert(temp, processors);
                        
                        temp = validParameter.validFile(parameters, "splitmethod", false);      
                        if (splitmethod != "fasta") {
@@ -213,14 +277,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                                else {  splitmethod = temp; }
                        }
                        
-                       temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
-                       convert(temp, cutoff); 
+                       temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
+                       m->mothurConvert(temp, cutoff); 
                        cutoff += (5 / (precision * 10.0));  
                        
-                       temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
-                       convert(temp, taxLevelCutoff); 
+                       temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
+                       m->mothurConvert(temp, taxLevelCutoff); 
                        
-                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
                        
                        if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
                        else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
@@ -246,49 +310,10 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 
 //**********************************************************************************************************************
 
-void ClusterSplitCommand::help(){
-       try {
-               m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
-               m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
-               m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
-               m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
-               m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
-               m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
-               m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
-               m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
-               m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
-               m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
-               m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
-               m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
-               m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
-               m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
-               m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
-               m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
-               m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
-               #ifdef USE_MPI
-               m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
-               #endif
-               m->mothurOut("The cluster.split command should be in the following format: \n");
-               m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
-               m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
-
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ClusterSplitCommand", "help");
-               exit(1);
-       }
-}
-
-//**********************************************************************************************************************
-
-ClusterSplitCommand::~ClusterSplitCommand(){}
-
-//**********************************************************************************************************************
-
 int ClusterSplitCommand::execute(){
        try {
        
-               if (abort == true) {    return 0;       }
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                
                time_t estart;
                vector<string> listFileNames;
@@ -360,6 +385,10 @@ int ClusterSplitCommand::execute(){
                vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
                delete split;
                
+               //output a merged distance file
+               if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
+                       
+                               
                if (m->control_pressed) { return 0; }
                
                m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
@@ -374,7 +403,6 @@ int ClusterSplitCommand::execute(){
                                        
                        //for each file group figure out which process will complete it
                        //want to divide the load intelligently so the big files are spread between processes
-                       int count = 1;
                        for (int i = 0; i < distName.size(); i++) { 
                                int processToAssign = (i+1) % processors; 
                                if (processToAssign == 0) { processToAssign = processors; }
@@ -522,73 +550,21 @@ int ClusterSplitCommand::execute(){
                MPI_Barrier(MPI_COMM_WORLD);
                
        #else
-
+               ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
+               //sanity check
+               if (processors > distName.size()) { processors = distName.size(); }
+               
                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                                if(processors == 1){
                                        listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
                                }else{
-                                       vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
-                                       dividedNames.resize(processors);
-                                       
-                                       //for each file group figure out which process will complete it
-                                       //want to divide the load intelligently so the big files are spread between processes
-                                       int count = 1;
-                                       for (int i = 0; i < distName.size(); i++) { 
-                                               int processToAssign = (i+1) % processors; 
-                                               if (processToAssign == 0) { processToAssign = processors; }
-                                               
-                                               dividedNames[(processToAssign-1)].push_back(distName[i]);
-                                       }
-                                       
-                                       //not lets reverse the order of ever other process, so we balance big files running with little ones
-                                       for (int i = 0; i < processors; i++) {
-                                               int remainder = ((i+1) % processors);
-                                               if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
-                                       }
-                                       
-                                       createProcesses(dividedNames);
-                                                       
-                                       if (m->control_pressed) { return 0; }
-
-                                       //get list of list file names from each process
-                                       for(int i=0;i<processors;i++){
-                                               string filename = toString(processIDS[i]) + ".temp";
-                                               ifstream in;
-                                               m->openInputFile(filename, in);
-                                               
-                                               in >> tag; m->gobble(in);
-                                               
-                                               while(!in.eof()) {
-                                                       string tempName;
-                                                       in >> tempName; m->gobble(in);
-                                                       listFileNames.push_back(tempName);
-                                               }
-                                               in.close();
-                                               remove((toString(processIDS[i]) + ".temp").c_str());
-                                               
-                                               //get labels
-                                               filename = toString(processIDS[i]) + ".temp.labels";
-                                               ifstream in2;
-                                               m->openInputFile(filename, in2);
-                                               
-                                               float tempCutoff;
-                                               in2 >> tempCutoff; m->gobble(in2);
-                                               if (tempCutoff < cutoff) { cutoff = tempCutoff; }
-                                               
-                                               while(!in2.eof()) {
-                                                       string tempName;
-                                                       in2 >> tempName; m->gobble(in2);
-                                                       if (labels.count(tempName) == 0) { labels.insert(tempName); }
-                                               }
-                                               in2.close();
-                                               remove((toString(processIDS[i]) + ".temp.labels").c_str());
-                                       }
-                               }
+                                       listFileNames = createProcesses(distName, labels);
+                }
                #else
                                listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
                #endif
        #endif  
-               if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
+               if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
                
                if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
                
@@ -605,14 +581,33 @@ int ClusterSplitCommand::execute(){
                ListVector* listSingle;
                map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
                
-               if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+               if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
                
                mergeLists(listFileNames, labelBins, listSingle);
 
-               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
                
                m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
                
+               //set list file as new current listfile
+               string current = "";
+               itTypes = outputTypes.find("list");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
+               }
+               
+               //set rabund file as new current rabundfile
+               itTypes = outputTypes.find("rabund");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
+               }
+               
+               //set sabund file as new current sabundfile
+               itTypes = outputTypes.find("sabund");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
+               }
+                               
                m->mothurOutEndLine();
                m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
@@ -652,7 +647,7 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
                                listSingle->push_back(secondCol);
                        }
                        in.close();
-                       remove(singleton.c_str());
+                       m->mothurRemove(singleton);
                        
                        numSingleBins = listSingle->getNumBins();
                }else{  listSingle = NULL; numSingleBins = 0;  }
@@ -678,8 +673,8 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
                for (int k = 0; k < listNames.size(); k++) {
        
                        if (m->control_pressed) {  
-                               if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
-                               for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
+                               if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
+                               for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
                                return labelBin;
                        }
                        
@@ -732,7 +727,7 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
                        delete input;
                        
                        outFilled.close();
-                       remove(listNames[k].c_str());
+                       m->mothurRemove(listNames[k]);
                        rename(filledInList.c_str(), listNames[k].c_str());
                }
                
@@ -782,7 +777,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                        //get the list info from each file
                        for (int k = 0; k < listNames.size(); k++) {
        
-                               if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
+                               if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
                                
                                InputData* input = new InputData(listNames[k], "list");
                                ListVector* list = input->getListVector(thisLabel);
@@ -814,7 +809,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                
                if (listSingle != NULL) { delete listSingle;  }
                
-               for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
+               for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
                
                return 0;
        }
@@ -846,12 +841,35 @@ void ClusterSplitCommand::printData(ListVector* oldList){
        }
 }
 //**********************************************************************************************************************
-int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
+vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
        try {
+        
+        vector<string> listFiles;
+        vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
+        dividedNames.resize(processors);
+        
+        //for each file group figure out which process will complete it
+        //want to divide the load intelligently so the big files are spread between processes
+        for (int i = 0; i < distName.size(); i++) { 
+            //cout << i << endl;
+            int processToAssign = (i+1) % processors; 
+            if (processToAssign == 0) { processToAssign = processors; }
+            
+            dividedNames[(processToAssign-1)].push_back(distName[i]);
+            if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
+        }
+        
+        //not lets reverse the order of ever other process, so we balance big files running with little ones
+        for (int i = 0; i < processors; i++) {
+            //cout << i << endl;
+            int remainder = ((i+1) % processors);
+            if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
+        }
+        
+        if (m->control_pressed) { return listFiles; }
        
        #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-               int process = 0;
-               int exitCommand = 1;
+               int process = 1;
                processIDS.clear();
                
                //loop through and create all the processes you want
@@ -885,17 +903,106 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
                                outLabels.close();
 
                                exit(0);
-                       }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
+                       }else { 
+                               m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                               for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                               exit(0);
+                       }
                }
                
+        //do your part
+        listFiles = cluster(dividedNames[0], labels);
+        
                //force parent to wait until all the processes are done
-               for (int i=0;i<processors;i++) { 
+               for (int i=0;i< processIDS.size();i++) { 
                        int temp = processIDS[i];
                        wait(&temp);
                }
+        
+        //get list of list file names from each process
+        for(int i=0;i<processIDS.size();i++){
+            string filename = toString(processIDS[i]) + ".temp";
+            ifstream in;
+            m->openInputFile(filename, in);
+            
+            in >> tag; m->gobble(in);
+            
+            while(!in.eof()) {
+                string tempName;
+                in >> tempName; m->gobble(in);
+                listFiles.push_back(tempName);
+            }
+            in.close();
+            m->mothurRemove((toString(processIDS[i]) + ".temp"));
+            
+            //get labels
+            filename = toString(processIDS[i]) + ".temp.labels";
+            ifstream in2;
+            m->openInputFile(filename, in2);
+            
+            float tempCutoff;
+            in2 >> tempCutoff; m->gobble(in2);
+            if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+            
+            while(!in2.eof()) {
+                string tempName;
+                in2 >> tempName; m->gobble(in2);
+                if (labels.count(tempName) == 0) { labels.insert(tempName); }
+            }
+            in2.close();
+            m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
+        }
+        
+
+    #else
+       
+        //////////////////////////////////////////////////////////////////////////////////////////////////////
+               //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
+               //Above fork() will clone, so memory is separate, but that's not the case with windows, 
+               //Taking advantage of shared memory to allow both threads to add labels.
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               
+               vector<clusterData*> pDataArray; 
+               DWORD   dwThreadIdArray[processors-1];
+               HANDLE  hThreadArray[processors-1]; 
+               
+               //Create processor worker threads.
+               for( int i=1; i<processors; i++ ){
+                       // Allocate memory for thread data.
+                       clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
+                       pDataArray.push_back(tempCluster);
+                       processIDS.push_back(i);
+            
+                       //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+                       //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+                       hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
+            
+               }
+        
+        //do your part
+        listFiles = cluster(dividedNames[0], labels);
+        
+               //Wait until all threads have terminated.
+               WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
                
-               return exitCommand;
+               //Close all thread handles and free memory allocations.
+               for(int i=0; i < pDataArray.size(); i++){
+            //get tag
+            tag = pDataArray[i]->tag;
+            //get listfiles created
+            for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
+            //get labels
+            set<string>::iterator it;
+            for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
+                       //check cutoff
+            if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
+                       CloseHandle(hThreadArray[i]);
+                       delete pDataArray[i];
+               }
+
        #endif          
+        
+        return listFiles;
        
        }
        catch(exception& e) {
@@ -907,27 +1014,24 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
 
 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
        try {
-               Cluster* cluster;
-               SparseMatrix* matrix;
-               ListVector* list;
-               ListVector oldList;
-               RAbundVector* rabund;
                
                vector<string> listFileNames;
-               
                double smallestCutoff = cutoff;
                
                //cluster each distance file
                for (int i = 0; i < distNames.size(); i++) {
+            
+            Cluster* cluster = NULL;
+            SparseMatrix* matrix = NULL;
+            ListVector* list = NULL;
+            ListVector oldList;
+            RAbundVector* rabund = NULL;
+            
                        if (m->control_pressed) { return listFileNames; }
                        
                        string thisNamefile = distNames[i].begin()->second;
                        string thisDistFile = distNames[i].begin()->first;
-                       
-                       //read in distance file
-                       globaldata->setNameFile(thisNamefile);
-                       globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
-                       
+                                               
                        #ifdef USE_MPI
                                int pid;
                                MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -953,8 +1057,8 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                        oldList = *list;
                        matrix = read->getMatrix();
                        
-                       delete read; 
-                       delete nameMap; 
+                       delete read;  read = NULL;
+                       delete nameMap; nameMap = NULL;
                        
                        
                        #ifdef USE_MPI
@@ -981,9 +1085,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                        m->openOutputFile(fileroot+ tag + ".list",      listFile);
                
                        listFileNames.push_back(fileroot+ tag + ".list");
-               
-                       time_t estart = time(NULL);
-                       
+                               
                        float previousDist = 0.00000;
                        float rndPreviousDist = 0.00000;
                        
@@ -998,7 +1100,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                                if (m->control_pressed) { //clean up
                                        delete matrix; delete list;     delete cluster; delete rabund;
                                        listFile.close();
-                                       for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
+                                       for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
                                        listFileNames.clear(); return listFileNames;
                                }
                
@@ -1041,15 +1143,16 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                        }
        
                        delete matrix; delete list;     delete cluster; delete rabund; 
+            matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
                        listFile.close();
                        
                        if (m->control_pressed) { //clean up
-                               for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
+                               for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
                                listFileNames.clear(); return listFileNames;
                        }
                        
-                       remove(thisDistFile.c_str());
-                       remove(thisNamefile.c_str());
+                       m->mothurRemove(thisDistFile);
+                       m->mothurRemove(thisNamefile);
                        
                        if (saveCutoff != cutoff) { 
                                if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
@@ -1073,5 +1176,45 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 
 
 }
+//**********************************************************************************************************************
 
+int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
+       try{
+               
+#ifdef USE_MPI
+               int pid;
+               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+               
+               if (pid != 0) {
+#endif
+               
+               string thisOutputDir = outputDir;
+               if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
+               m->mothurRemove(outputFileName);
+               
+               
+               for (int i = 0; i < distNames.size(); i++) {
+                       if (m->control_pressed) {  return 0; }
+                       
+                       string thisDistFile = distNames[i].begin()->first;
+                       
+                       m->appendFiles(thisDistFile, outputFileName);
+               }       
+                       
+               outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
+                       
+#ifdef USE_MPI
+               }
+#endif
+                               
+               return 0;       
+               
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
+               exit(1);
+       }
+}
 //**********************************************************************************************************************