clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
  20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
  21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
  22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
  23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
  24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
  25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
  26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
  27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
  28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
  29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
  30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
  31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
  32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
  33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
  34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
  35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
  36
  37                 vector<string> myArray;
  38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  39                 return myArray;
  40         }
  41         catch(exception& e) {
  42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  43                 exit(1);
  44         }
  45 }
  46 //**********************************************************************************************************************
  47 string ClusterSplitCommand::getHelpString(){
  48         try {
  49                 string helpString = "";
  50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
  51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  59                 helpString += "The name parameter allows you to enter your name file. \n";
  60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
  62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  70 #ifdef USE_MPI
  71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  72 #endif
  73                 helpString += "The cluster.split command should be in the following format: \n";
  74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  76                 return helpString;
  77         }
  78         catch(exception& e) {
  79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  80                 exit(1);
  81         }
  82 }
  83 //**********************************************************************************************************************
  84 string ClusterSplitCommand::getOutputPattern(string type) {
  85     try {
  86         string pattern = "";
  87
  88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
  89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
  90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
  91         else if (type == "column") {  pattern = "[filename],dist"; }
  92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
  93
  94         return pattern;
  95     }
  96     catch(exception& e) {
  97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
  98         exit(1);
  99     }
 100 }
 101 //**********************************************************************************************************************
 102 ClusterSplitCommand::ClusterSplitCommand(){
 103         try {
 104                 abort = true; calledHelp = true;
 105                 setParameters();
 106                 vector<string> tempOutNames;
 107                 outputTypes["list"] = tempOutNames;
 108                 outputTypes["rabund"] = tempOutNames;
 109                 outputTypes["sabund"] = tempOutNames;
 110                 outputTypes["column"] = tempOutNames;
 111         }
 112         catch(exception& e) {
 113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 114                 exit(1);
 115         }
 116 }
 117 //**********************************************************************************************************************
 118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 120         try{
 121                 abort = false; calledHelp = false;
 122                 format = "";
 123
 124                 //allow user to run help
 125                 if(option == "help") { help(); abort = true; calledHelp = true; }
 126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 127
 128                 else {
 129                         vector<string> myArray = setParameters();
 130
 131                         OptionParser parser(option);
 132                         map<string,string> parameters = parser.getParameters();
 133
 134                         ValidParameters validParameter("cluster.split");
 135
 136                         //check to make sure all parameters are valid for command
 137                         map<string,string>::iterator it;
 138                         for (it = parameters.begin(); it != parameters.end(); it++) {
 139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 140                                         abort = true;
 141                                 }
 142                         }
 143
 144                         //initialize outputTypes
 145                         vector<string> tempOutNames;
 146                         outputTypes["list"] = tempOutNames;
 147                         outputTypes["rabund"] = tempOutNames;
 148                         outputTypes["sabund"] = tempOutNames;
 149                         outputTypes["column"] = tempOutNames;
 150
 151                         //if the user changes the output directory command factory will send this info to us in the output parameter
 152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 153
 154                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 156                         if (inputDir == "not found"){   inputDir = "";          }
 157                         else {
 158                                 string path;
 159                                 it = parameters.find("phylip");
 160                                 //user has given a template file
 161                                 if(it != parameters.end()){
 162                                         path = m->hasPath(it->second);
 163                                         //if the user has not given a path then, add inputdir. else leave path alone.
 164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 165                                 }
 166
 167                                 it = parameters.find("column");
 168                                 //user has given a template file
 169                                 if(it != parameters.end()){
 170                                         path = m->hasPath(it->second);
 171                                         //if the user has not given a path then, add inputdir. else leave path alone.
 172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 173                                 }
 174
 175                                 it = parameters.find("name");
 176                                 //user has given a template file
 177                                 if(it != parameters.end()){
 178                                         path = m->hasPath(it->second);
 179                                         //if the user has not given a path then, add inputdir. else leave path alone.
 180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 181                                 }
 182
 183                                 it = parameters.find("taxonomy");
 184                                 //user has given a template file
 185                                 if(it != parameters.end()){
 186                                         path = m->hasPath(it->second);
 187                                         //if the user has not given a path then, add inputdir. else leave path alone.
 188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 189                                 }
 190
 191                                 it = parameters.find("fasta");
 192                                 //user has given a template file
 193                                 if(it != parameters.end()){
 194                                         path = m->hasPath(it->second);
 195                                         //if the user has not given a path then, add inputdir. else leave path alone.
 196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 197                                 }
 198
 199                 it = parameters.find("count");
 200                                 //user has given a template file
 201                                 if(it != parameters.end()){
 202                                         path = m->hasPath(it->second);
 203                                         //if the user has not given a path then, add inputdir. else leave path alone.
 204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 205                                 }
 206                         }
 207
 208                         //check for required parameters
 209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 210                         if (phylipfile == "not open") { abort = true; }
 211                         else if (phylipfile == "not found") { phylipfile = ""; }
 212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 213
 214                         columnfile = validParameter.validFile(parameters, "column", true);
 215                         if (columnfile == "not open") { abort = true; }
 216                         else if (columnfile == "not found") { columnfile = ""; }
 217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 218
 219                         namefile = validParameter.validFile(parameters, "name", true);
 220                         if (namefile == "not open") { abort = true; namefile = "";}
 221                         else if (namefile == "not found") { namefile = "";  }
 222                         else { m->setNameFile(namefile); }
 223
 224             countfile = validParameter.validFile(parameters, "count", true);
 225                         if (countfile == "not open") { abort = true; countfile = "";}
 226                         else if (countfile == "not found") { countfile = "";  }
 227                         else { m->setCountTableFile(countfile); }
 228
 229                         fastafile = validParameter.validFile(parameters, "fasta", true);
 230                         if (fastafile == "not open") { abort = true; }
 231                         else if (fastafile == "not found") { fastafile = ""; }
 232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 233
 234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 235                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 236                         else if (taxFile == "not found") { taxFile = ""; }
 237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 238
 239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 240                                 //is there are current file available for either of these?
 241                                 //give priority to column, then phylip, then fasta
 242                                 columnfile = m->getColumnFile();
 243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 244                                 else {
 245                                         phylipfile = m->getPhylipFile();
 246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 247                                         else {
 248                                                 fastafile = m->getFastaFile();
 249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 250                                                 else {
 251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 252                                                         abort = true;
 253                                                 }
 254                                         }
 255                                 }
 256                         }
 257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 258
 259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 260
 261                         if (columnfile != "") {
 262                                 if ((namefile == "") && (countfile == "")) {
 263                                         namefile = m->getNameFile();
 264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 265                                         else {
 266                                                 countfile = m->getCountTableFile();
 267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 268                         else {
 269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 270                             abort = true;
 271                         }
 272                                         }
 273                                 }
 274                         }
 275
 276                         if (fastafile != "") {
 277                                 if (taxFile == "") {
 278                                         taxFile = m->getTaxonomyFile();
 279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 280                                         else {
 281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 282                                                 abort = true;
 283                                         }
 284                                 }
 285
 286                                 if ((namefile == "") && (countfile == "")) {
 287                                         namefile = m->getNameFile();
 288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 289                                         else {
 290                                                 countfile = m->getCountTableFile();
 291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 292                         else {
 293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 294                             abort = true;
 295                         }
 296                                         }
 297                                 }
 298                         }
 299
 300                         //check for optional parameter and set defaults
 301                         // ...at some point should added some additional type checking...
 302                         //get user cutoff and precision or use defaults
 303                         string temp;
 304                         temp = validParameter.validFile(parameters, "precision", false);
 305                         if (temp == "not found") { temp = "100"; }
 306                         //saves precision legnth for formatting below
 307                         length = temp.length();
 308                         m->mothurConvert(temp, precision);
 309
 310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 311                         hard = m->isTrue(temp);
 312
 313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 314                         large = m->isTrue(temp);
 315
 316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 317                         m->setProcessors(temp);
 318                         m->mothurConvert(temp, processors);
 319
 320                         temp = validParameter.validFile(parameters, "splitmethod", false);
 321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 322                                 if (temp == "not found")  { splitmethod = "distance"; }
 323                                 else {  splitmethod = temp; }
 324                         }
 325
 326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 327                         classic = m->isTrue(temp);
 328
 329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 330
 331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 332                         m->mothurConvert(temp, cutoff);
 333                         cutoff += (5 / (precision * 10.0));
 334
 335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 336                         m->mothurConvert(temp, taxLevelCutoff);
 337
 338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 339
 340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 342
 343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 345
 346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 347
 348                         showabund = validParameter.validFile(parameters, "showabund", false);
 349                         if (showabund == "not found") { showabund = "T"; }
 350
 351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
 352             runCluster = m->isTrue(temp);
 353
 354                         timing = validParameter.validFile(parameters, "timing", false);
 355                         if (timing == "not found") { timing = "F"; }
 356
 357                 }
 358         }
 359         catch(exception& e) {
 360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 361                 exit(1);
 362         }
 363 }
 364
 365 //**********************************************************************************************************************
 366
 367 int ClusterSplitCommand::execute(){
 368         try {
 369
 370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 371
 372                 time_t estart;
 373                 vector<string> listFileNames;
 374                 set<string> labels;
 375                 string singletonName = "";
 376                 double saveCutoff = cutoff;
 377
 378                 //****************** file prep work ******************************//
 379                 #ifdef USE_MPI
 380                         int pid;
 381                         int tag = 2001;
 382                         MPI_Status status;
 383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 385
 386                         if (pid == 0) { //only process 0 converts and splits
 387
 388                 #endif
 389
 390                 //if user gave a phylip file convert to column file
 391                 if (format == "phylip") {
 392                         estart = time(NULL);
 393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 394
 395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 396
 397                         NameAssignment* nameMap = NULL;
 398                         convert->setFormat("phylip");
 399                         convert->read(nameMap);
 400
 401                         if (m->control_pressed) {  delete convert;  return 0;  }
 402
 403                         distfile = convert->getOutputFile();
 404
 405                         //if no names file given with phylip file, create it
 406                         ListVector* listToMakeNameFile =  convert->getListVector();
 407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 408                                 ofstream out;
 409                                 namefile = phylipfile + ".names";
 410                                 m->openOutputFile(namefile, out);
 411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 412                                         string bin = listToMakeNameFile->get(i);
 413                                         out << bin << '\t' << bin << endl;
 414                                 }
 415                                 out.close();
 416                         }
 417                         delete listToMakeNameFile;
 418                         delete convert;
 419
 420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 421                 }
 422                 if (m->control_pressed) { return 0; }
 423
 424                 estart = time(NULL);
 425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 426
 427                 //split matrix into non-overlapping groups
 428                 SplitMatrix* split;
 429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 433
 434                 split->split();
 435
 436                 if (m->control_pressed) { delete split; return 0; }
 437
 438                 singletonName = split->getSingletonNames();
 439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 440                 delete split;
 441
 442         if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
 443
 444                 //output a merged distance file
 445                 //if (splitmethod == "fasta")           { createMergedDistanceFile(distName); }
 446
 447                 if (m->control_pressed) { return 0; }
 448
 449                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 450                 estart = time(NULL);
 451
 452         if (!runCluster) {
 453
 454                 m->mothurOutEndLine();
 455                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 456                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
 457                 m->mothurOutEndLine();
 458                 return 0;
 459
 460         }
 461
 462                 //****************** break up files between processes and cluster each file set ******************************//
 463         #ifdef USE_MPI
 464                         ////you are process 0 from above////
 465
 466                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 467                         dividedNames.resize(processors);
 468
 469                         //for each file group figure out which process will complete it
 470                         //want to divide the load intelligently so the big files are spread between processes
 471                         for (int i = 0; i < distName.size(); i++) {
 472                                 int processToAssign = (i+1) % processors;
 473                                 if (processToAssign == 0) { processToAssign = processors; }
 474
 475                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 476                         }
 477
 478                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 479                         for (int i = 0; i < processors; i++) {
 480                                 int remainder = ((i+1) % processors);
 481                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 482                         }
 483
 484
 485                         //send each child the list of files it needs to process
 486                         for(int i = 1; i < processors; i++) {
 487                                 //send number of file pairs
 488                                 int num = dividedNames[i].size();
 489                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 490
 491                                 for (int j = 0; j < num; j++) { //send filenames to process i
 492                                         char tempDistFileName[1024];
 493                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 494                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 495
 496                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 497                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 498
 499                                         char tempNameFileName[1024];
 500                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 501                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 502
 503                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 504                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 505                                 }
 506                         }
 507
 508                         //process your share
 509                         listFileNames = cluster(dividedNames[0], labels);
 510
 511                         //receive the other processes info
 512                         for(int i = 1; i < processors; i++) {
 513                                 int num = dividedNames[i].size();
 514
 515                                 double tempCutoff;
 516                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 517                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 518
 519                                 //send list filenames to root process
 520                                 for (int j = 0; j < num; j++) {
 521                                         int lengthList = 0;
 522                                         char tempListFileName[1024];
 523
 524                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 525                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 526
 527                                         string myListFileName = tempListFileName;
 528                                         myListFileName = myListFileName.substr(0, lengthList);
 529
 530                                         listFileNames.push_back(myListFileName);
 531                                 }
 532
 533                                 //send Labels to root process
 534                                 int numLabels = 0;
 535                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 536
 537                                 for (int j = 0; j < numLabels; j++) {
 538                                         int lengthLabel = 0;
 539                                         char tempLabel[100];
 540
 541                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 542                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 543
 544                                         string myLabel = tempLabel;
 545                                         myLabel = myLabel.substr(0, lengthLabel);
 546
 547                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 548                                 }
 549                         }
 550
 551                 }else { //you are a child process
 552                         vector < map<string, string> >  myNames;
 553
 554                         //recieve the files you need to process
 555                         //receive number of file pairs
 556                         int num = 0;
 557                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 558
 559                         myNames.resize(num);
 560
 561                         for (int j = 0; j < num; j++) { //receive filenames to process
 562                                 int lengthDist = 0;
 563                                 char tempDistFileName[1024];
 564
 565                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 566                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 567
 568                                 string myDistFileName = tempDistFileName;
 569                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 570
 571                                 int lengthName = 0;
 572                                 char tempNameFileName[1024];
 573
 574                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 575                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 576
 577                                 string myNameFileName = tempNameFileName;
 578                                 myNameFileName = myNameFileName.substr(0, lengthName);
 579
 580                                 //save file name
 581                                 myNames[j][myDistFileName] = myNameFileName;
 582                         }
 583
 584                         //process them
 585                         listFileNames = cluster(myNames, labels);
 586
 587                         //send cutoff
 588                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 589
 590                         //send list filenames to root process
 591                         for (int j = 0; j < num; j++) {
 592                                 char tempListFileName[1024];
 593                                 strcpy(tempListFileName, listFileNames[j].c_str());
 594                                 int lengthList = listFileNames[j].length();
 595
 596                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 597                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 598                         }
 599
 600                         //send Labels to root process
 601                         int numLabels = labels.size();
 602                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 603
 604                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 605                                 char tempLabel[100];
 606                                 strcpy(tempLabel, (*it).c_str());
 607                                 int lengthLabel = (*it).length();
 608
 609                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 610                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 611                         }
 612                 }
 613
 614                 //make everyone wait
 615                 MPI_Barrier(MPI_COMM_WORLD);
 616
 617         #else
 618                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 619                 //sanity check
 620                 if (processors > distName.size()) { processors = distName.size(); }
 621
 622                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 623                                 if(processors == 1){
 624                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 625                                 }else{
 626                                         listFileNames = createProcesses(distName, labels);
 627                 }
 628                 #else
 629                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 630                 #endif
 631         #endif
 632                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 633
 634                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 635
 636                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 637
 638                 //****************** merge list file and create rabund and sabund files ******************************//
 639                 estart = time(NULL);
 640                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 641
 642                 #ifdef USE_MPI
 643                         if (pid == 0) { //only process 0 merges
 644                 #endif
 645
 646                 ListVector* listSingle;
 647                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 648
 649                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 650
 651                 mergeLists(listFileNames, labelBins, listSingle);
 652
 653                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 654
 655                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 656
 657                 //set list file as new current listfile
 658                 string current = "";
 659                 itTypes = outputTypes.find("list");
 660                 if (itTypes != outputTypes.end()) {
 661                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 662                 }
 663
 664                 //set rabund file as new current rabundfile
 665                 itTypes = outputTypes.find("rabund");
 666                 if (itTypes != outputTypes.end()) {
 667                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 668                 }
 669
 670                 //set sabund file as new current sabundfile
 671                 itTypes = outputTypes.find("sabund");
 672                 if (itTypes != outputTypes.end()) {
 673                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 674                 }
 675
 676                 m->mothurOutEndLine();
 677                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 678                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 679                 m->mothurOutEndLine();
 680
 681                 #ifdef USE_MPI
 682                         } //only process 0 merges
 683
 684                         //make everyone wait
 685                         MPI_Barrier(MPI_COMM_WORLD);
 686                 #endif
 687
 688                 return 0;
 689         }
 690         catch(exception& e) {
 691                 m->errorOut(e, "ClusterSplitCommand", "execute");
 692                 exit(1);
 693         }
 694 }
 695 //**********************************************************************************************************************
 696 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 697         try {
 698
 699                 map<float, int> labelBin;
 700                 vector<float> orderFloat;
 701                 int numSingleBins;
 702
 703                 //read in singletons
 704                 if (singleton != "none") {
 705
 706             ifstream in;
 707             m->openInputFile(singleton, in);
 708
 709                         string firstCol, secondCol;
 710                         listSingle = new ListVector();
 711
 712             if (countfile != "") { m->getline(in); m->gobble(in); }
 713
 714                         while (!in.eof()) {
 715                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 716                                 if (countfile == "") { listSingle->push_back(secondCol); }
 717                 else { listSingle->push_back(firstCol); }
 718                         }
 719
 720                         in.close();
 721                         m->mothurRemove(singleton);
 722
 723                         numSingleBins = listSingle->getNumBins();
 724                 }else{  listSingle = NULL; numSingleBins = 0;  }
 725
 726                 //go through users set and make them floats so we can sort them
 727                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 728                         float temp = -10.0;
 729
 730                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 731                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 732
 733                         if (temp <= cutoff) {
 734                                 orderFloat.push_back(temp);
 735                                 labelBin[temp] = numSingleBins; //initialize numbins
 736                         }
 737                 }
 738
 739                 //sort order
 740                 sort(orderFloat.begin(), orderFloat.end());
 741                 userLabels.clear();
 742
 743                 //get the list info from each file
 744                 for (int k = 0; k < listNames.size(); k++) {
 745
 746                         if (m->control_pressed) {
 747                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 748                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 749                                 return labelBin;
 750                         }
 751
 752                         InputData* input = new InputData(listNames[k], "list");
 753                         ListVector* list = input->getListVector();
 754                         string lastLabel = list->getLabel();
 755
 756                         string filledInList = listNames[k] + "filledInTemp";
 757                         ofstream outFilled;
 758                         m->openOutputFile(filledInList, outFilled);
 759
 760                         //for each label needed
 761                         for(int l = 0; l < orderFloat.size(); l++){
 762
 763                                 string thisLabel;
 764                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 765                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 766
 767                                 //this file has reached the end
 768                                 if (list == NULL) {
 769                                         list = input->getListVector(lastLabel, true);
 770                                 }else{  //do you have the distance, or do you need to fill in
 771
 772                                         float labelFloat;
 773                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 774                                         else { convert(list->getLabel(), labelFloat); }
 775
 776                                         //check for missing labels
 777                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 778                                                 //if its bigger get last label, otherwise keep it
 779                                                 delete list;
 780                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 781                                         }
 782                                         lastLabel = list->getLabel();
 783                                 }
 784
 785                                 //print to new file
 786                                 list->setLabel(thisLabel);
 787                                 list->print(outFilled);
 788
 789                                 //update labelBin
 790                                 labelBin[orderFloat[l]] += list->getNumBins();
 791
 792                                 delete list;
 793
 794                                 list = input->getListVector();
 795                         }
 796
 797                         if (list != NULL) { delete list; }
 798                         delete input;
 799
 800                         outFilled.close();
 801                         m->mothurRemove(listNames[k]);
 802                         rename(filledInList.c_str(), listNames[k].c_str());
 803                 }
 804
 805                 return labelBin;
 806         }
 807         catch(exception& e) {
 808                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 809                 exit(1);
 810         }
 811 }
 812 //**********************************************************************************************************************
 813 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 814         try {
 815                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 816                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 817
 818         map<string, string> variables;
 819         variables["[filename]"] = fileroot;
 820         variables["[clustertag]"] = tag;
 821         string sabundFileName = getOutputFileName("sabund", variables);
 822         string rabundFileName = getOutputFileName("rabund", variables);
 823         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
 824         string listFileName = getOutputFileName("list", variables);
 825
 826         if (countfile == "") {
 827             m->openOutputFile(sabundFileName,   outSabund);
 828             m->openOutputFile(rabundFileName,   outRabund);
 829             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 830             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 831
 832         }
 833                 m->openOutputFile(listFileName, outList);
 834         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 835
 836
 837                 map<float, int>::iterator itLabel;
 838
 839                 //for each label needed
 840                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 841
 842                         string thisLabel;
 843                         if (itLabel->first == -1) { thisLabel = "unique"; }
 844                         else { thisLabel = toString(itLabel->first,  length-1);  }
 845
 846                         //outList << thisLabel << '\t' << itLabel->second << '\t';
 847
 848             RAbundVector* rabund = NULL;
 849             ListVector completeList;
 850             completeList.setLabel(thisLabel);
 851
 852             if (countfile == "") {
 853                 rabund = new RAbundVector();
 854                 rabund->setLabel(thisLabel);
 855             }
 856
 857                         //add in singletons
 858                         if (listSingle != NULL) {
 859                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 860                                         //outList << listSingle->get(j) << '\t';
 861                     completeList.push_back(listSingle->get(j));
 862                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 863                                 }
 864                         }
 865
 866                         //get the list info from each file
 867                         for (int k = 0; k < listNames.size(); k++) {
 868
 869                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 870
 871                                 InputData* input = new InputData(listNames[k], "list");
 872                                 ListVector* list = input->getListVector(thisLabel);
 873
 874                                 //this file has reached the end
 875                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 876                                 else {
 877                                         for (int j = 0; j < list->getNumBins(); j++) {
 878                                                 //outList << list->get(j) << '\t';
 879                         completeList.push_back(list->get(j));
 880                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 881                                         }
 882                                         delete list;
 883                                 }
 884                                 delete input;
 885                         }
 886
 887             if (countfile == "") {
 888                 SAbundVector sabund = rabund->getSAbundVector();
 889                 sabund.print(outSabund);
 890                 rabund->print(outRabund);
 891             }
 892                         //outList << endl;
 893             completeList.print(outList);
 894
 895                         if (rabund != NULL) { delete rabund; }
 896                 }
 897
 898                 outList.close();
 899         if (countfile == "") {
 900             outRabund.close();
 901             outSabund.close();
 902                 }
 903                 if (listSingle != NULL) { delete listSingle;  }
 904
 905                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 906
 907                 return 0;
 908         }
 909         catch(exception& e) {
 910                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 911                 exit(1);
 912         }
 913 }
 914
 915 //**********************************************************************************************************************
 916
 917 void ClusterSplitCommand::printData(ListVector* oldList){
 918         try {
 919                 string label = oldList->getLabel();
 920                 RAbundVector oldRAbund = oldList->getRAbundVector();
 921
 922                 oldRAbund.setLabel(label);
 923                 if (m->isTrue(showabund)) {
 924                         oldRAbund.getSAbundVector().print(cout);
 925                 }
 926                 oldRAbund.print(outRabund);
 927                 oldRAbund.getSAbundVector().print(outSabund);
 928
 929                 oldList->print(outList);
 930         }
 931         catch(exception& e) {
 932                 m->errorOut(e, "ClusterSplitCommand", "printData");
 933                 exit(1);
 934         }
 935 }
 936 //**********************************************************************************************************************
 937 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 938         try {
 939
 940         vector<string> listFiles;
 941         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 942         dividedNames.resize(processors);
 943
 944         //for each file group figure out which process will complete it
 945         //want to divide the load intelligently so the big files are spread between processes
 946         for (int i = 0; i < distName.size(); i++) {
 947             //cout << i << endl;
 948             int processToAssign = (i+1) % processors;
 949             if (processToAssign == 0) { processToAssign = processors; }
 950
 951             dividedNames[(processToAssign-1)].push_back(distName[i]);
 952             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 953         }
 954
 955         //now lets reverse the order of ever other process, so we balance big files running with little ones
 956         for (int i = 0; i < processors; i++) {
 957             //cout << i << endl;
 958             int remainder = ((i+1) % processors);
 959             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 960         }
 961
 962         if (m->control_pressed) { return listFiles; }
 963
 964         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 965                 int process = 1;
 966                 processIDS.clear();
 967
 968                 //loop through and create all the processes you want
 969                 while (process != processors) {
 970                         int pid = fork();
 971
 972                         if (pid > 0) {
 973                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 974                                 process++;
 975                         }else if (pid == 0){
 976                                 set<string> labels;
 977                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 978
 979                                 //write out names to file
 980                                 string filename = toString(getpid()) + ".temp";
 981                                 ofstream out;
 982                                 m->openOutputFile(filename, out);
 983                                 out << tag << endl;
 984                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 985                                 out.close();
 986
 987                                 //print out labels
 988                                 ofstream outLabels;
 989                                 filename = toString(getpid()) + ".temp.labels";
 990                                 m->openOutputFile(filename, outLabels);
 991
 992                                 outLabels << cutoff << endl;
 993                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 994                                         outLabels << (*it) << endl;
 995                                 }
 996                                 outLabels.close();
 997
 998                                 exit(0);
 999                         }else {
1000                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1001                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1002                                 exit(0);
1003                         }
1004                 }
1005
1006         //do your part
1007         listFiles = cluster(dividedNames[0], labels);
1008
1009                 //force parent to wait until all the processes are done
1010                 for (int i=0;i< processIDS.size();i++) {
1011                         int temp = processIDS[i];
1012                         wait(&temp);
1013                 }
1014
1015         //get list of list file names from each process
1016         for(int i=0;i<processIDS.size();i++){
1017             string filename = toString(processIDS[i]) + ".temp";
1018             ifstream in;
1019             m->openInputFile(filename, in);
1020
1021             in >> tag; m->gobble(in);
1022
1023             while(!in.eof()) {
1024                 string tempName;
1025                 in >> tempName; m->gobble(in);
1026                 listFiles.push_back(tempName);
1027             }
1028             in.close();
1029             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1030
1031             //get labels
1032             filename = toString(processIDS[i]) + ".temp.labels";
1033             ifstream in2;
1034             m->openInputFile(filename, in2);
1035
1036             float tempCutoff;
1037             in2 >> tempCutoff; m->gobble(in2);
1038             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1039
1040             while(!in2.eof()) {
1041                 string tempName;
1042                 in2 >> tempName; m->gobble(in2);
1043                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1044             }
1045             in2.close();
1046             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1047         }
1048
1049
1050     #else
1051
1052         //////////////////////////////////////////////////////////////////////////////////////////////////////
1053                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1054                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1055                 //Taking advantage of shared memory to allow both threads to add labels.
1056                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1057                 /*
1058                 vector<clusterData*> pDataArray;
1059                 DWORD   dwThreadIdArray[processors-1];
1060                 HANDLE  hThreadArray[processors-1];
1061
1062                 //Create processor worker threads.
1063                 for( int i=1; i<processors; i++ ){
1064                         // Allocate memory for thread data.
1065                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1066                         pDataArray.push_back(tempCluster);
1067                         processIDS.push_back(i);
1068
1069                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1070                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1071                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1072
1073                 }
1074
1075         //do your part
1076         listFiles = cluster(dividedNames[0], labels);
1077
1078                 //Wait until all threads have terminated.
1079                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1080
1081                 //Close all thread handles and free memory allocations.
1082                 for(int i=0; i < pDataArray.size(); i++){
1083             //get tag
1084             tag = pDataArray[i]->tag;
1085             //get listfiles created
1086             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1087             //get labels
1088             set<string>::iterator it;
1089             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1090                         //check cutoff
1091             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1092                         CloseHandle(hThreadArray[i]);
1093                         delete pDataArray[i];
1094                 }
1095 */
1096         #endif
1097
1098         return listFiles;
1099
1100         }
1101         catch(exception& e) {
1102                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1103                 exit(1);
1104         }
1105 }
1106 //**********************************************************************************************************************
1107
1108 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1109         try {
1110
1111                 vector<string> listFileNames;
1112                 double smallestCutoff = cutoff;
1113
1114                 //cluster each distance file
1115                 for (int i = 0; i < distNames.size(); i++) {
1116
1117                         string thisNamefile = distNames[i].begin()->second;
1118                         string thisDistFile = distNames[i].begin()->first;
1119
1120                         string listFileName = "";
1121             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1122             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1123
1124                         if (m->control_pressed) { //clean up
1125                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1126                                 listFileNames.clear(); return listFileNames;
1127                         }
1128
1129             listFileNames.push_back(listFileName);
1130         }
1131
1132                 cutoff = smallestCutoff;
1133
1134                 return listFileNames;
1135
1136         }
1137         catch(exception& e) {
1138                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1139                 exit(1);
1140         }
1141
1142
1143 }
1144 //**********************************************************************************************************************
1145 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1146         try {
1147         string listFileName = "";
1148
1149         ListVector* list = NULL;
1150         ListVector oldList;
1151         RAbundVector* rabund = NULL;
1152
1153 #ifdef USE_MPI
1154         int pid;
1155         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1156
1157         //output your files too
1158         if (pid != 0) {
1159             cout << endl << "Reading " << thisDistFile << endl;
1160         }
1161 #endif
1162
1163         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1164
1165         //reads phylip file storing data in 2D vector, also fills list and rabund
1166         bool sim = false;
1167                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1168
1169         NameAssignment* nameMap = NULL;
1170         CountTable* ct = NULL;
1171         if(namefile != ""){
1172                         nameMap = new NameAssignment(thisNamefile);
1173                         nameMap->readMap();
1174             cluster->readPhylipFile(thisDistFile, nameMap);
1175                 }else if (countfile != "") {
1176             ct = new CountTable();
1177             ct->readTable(thisNamefile, false);
1178             cluster->readPhylipFile(thisDistFile, ct);
1179         }
1180         tag = cluster->getTag();
1181
1182                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1183             else { delete ct; } delete cluster; return 0; }
1184
1185                 list = cluster->getListVector();
1186                 rabund = cluster->getRAbundVector();
1187
1188                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1189                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1190         listFileName = fileroot+ tag + ".list";
1191
1192         ofstream listFile;
1193                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1194
1195                 float previousDist = 0.00000;
1196                 float rndPreviousDist = 0.00000;
1197                 oldList = *list;
1198
1199 #ifdef USE_MPI
1200         //output your files too
1201         if (pid != 0) {
1202             cout << endl << "Clustering " << thisDistFile << endl;
1203         }
1204 #endif
1205
1206         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1207
1208                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1209                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1210                 else { delete ct; } return listFileName;  }
1211
1212                         cluster->update(cutoff);
1213
1214                         float dist = cluster->getSmallDist();
1215                         float rndDist;
1216                         if (hard) {
1217                                 rndDist = m->ceilDist(dist, precision);
1218                         }else{
1219                                 rndDist = m->roundDist(dist, precision);
1220                         }
1221
1222             if(previousDist <= 0.0000 && dist != previousDist){
1223                 oldList.setLabel("unique");
1224                 oldList.print(listFile);
1225                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1226             }
1227             else if(rndDist != rndPreviousDist){
1228                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1229                 oldList.print(listFile);
1230                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1231             }
1232
1233
1234                         previousDist = dist;
1235                         rndPreviousDist = rndDist;
1236                         oldList = *list;
1237                 }
1238
1239                 if(previousDist <= 0.0000){
1240             oldList.setLabel("unique");
1241             oldList.print(listFile);
1242             if (labels.count("unique") == 0) { labels.insert("unique"); }
1243         }
1244         else if(rndPreviousDist<cutoff){
1245             oldList.setLabel(toString(rndPreviousDist,  length-1));
1246             oldList.print(listFile);
1247             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1248         }
1249
1250
1251                 listFile.close();
1252
1253                 delete cluster;  delete list; delete rabund;
1254         if(namefile != ""){     delete nameMap; }
1255         else { delete ct; }
1256
1257         m->mothurRemove(thisDistFile);
1258         m->mothurRemove(thisNamefile);
1259
1260         return listFileName;
1261
1262         }
1263         catch(exception& e) {
1264                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1265                 exit(1);
1266         }
1267 }
1268
1269 //**********************************************************************************************************************
1270 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1271         try {
1272         string listFileName = "";
1273
1274         Cluster* cluster = NULL;
1275         SparseDistanceMatrix* matrix = NULL;
1276         ListVector* list = NULL;
1277         ListVector oldList;
1278         RAbundVector* rabund = NULL;
1279
1280         if (m->control_pressed) { return listFileName; }
1281
1282 #ifdef USE_MPI
1283         int pid;
1284         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1285
1286         //output your files too
1287         if (pid != 0) {
1288             cout << endl << "Reading " << thisDistFile << endl;
1289         }
1290 #endif
1291
1292         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1293
1294         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1295         read->setCutoff(cutoff);
1296
1297         NameAssignment* nameMap = NULL;
1298         CountTable* ct = NULL;
1299                 if(namefile != ""){
1300                         nameMap = new NameAssignment(thisNamefile);
1301                         nameMap->readMap();
1302             read->read(nameMap);
1303                 }else if (countfile != "") {
1304             ct = new CountTable();
1305             ct->readTable(thisNamefile, false);
1306             read->read(ct);
1307         }else { read->read(nameMap); }
1308
1309                 list = read->getListVector();
1310         oldList = *list;
1311                 matrix = read->getDMatrix();
1312
1313                 if(countfile != "") {
1314             rabund = new RAbundVector();
1315             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1316             delete ct;
1317         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1318
1319         delete read;  read = NULL;
1320         if (namefile != "") { delete nameMap; nameMap = NULL; }
1321
1322
1323 #ifdef USE_MPI
1324         //output your files too
1325         if (pid != 0) {
1326             cout << endl << "Clustering " << thisDistFile << endl;
1327         }
1328 #endif
1329
1330         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1331
1332         //create cluster
1333         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1334         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1335         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1336         tag = cluster->getTag();
1337
1338         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1339         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1340
1341         ofstream listFile;
1342         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1343
1344         listFileName = fileroot+ tag + ".list";
1345
1346         float previousDist = 0.00000;
1347         float rndPreviousDist = 0.00000;
1348
1349         oldList = *list;
1350
1351         print_start = true;
1352         start = time(NULL);
1353         double saveCutoff = cutoff;
1354
1355         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1356
1357             if (m->control_pressed) { //clean up
1358                 delete matrix; delete list;     delete cluster; delete rabund;
1359                 listFile.close();
1360                 m->mothurRemove(listFileName);
1361                 return listFileName;
1362             }
1363
1364             cluster->update(saveCutoff);
1365
1366             float dist = matrix->getSmallDist();
1367             float rndDist;
1368             if (hard) {
1369                 rndDist = m->ceilDist(dist, precision);
1370             }else{
1371                 rndDist = m->roundDist(dist, precision);
1372             }
1373
1374             if(previousDist <= 0.0000 && dist != previousDist){
1375                 oldList.setLabel("unique");
1376                 oldList.print(listFile);
1377                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1378             }
1379             else if(rndDist != rndPreviousDist){
1380                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1381                 oldList.print(listFile);
1382                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1383             }
1384
1385             previousDist = dist;
1386             rndPreviousDist = rndDist;
1387             oldList = *list;
1388         }
1389
1390
1391         if(previousDist <= 0.0000){
1392             oldList.setLabel("unique");
1393             oldList.print(listFile);
1394             if (labels.count("unique") == 0) { labels.insert("unique"); }
1395         }
1396         else if(rndPreviousDist<cutoff){
1397             oldList.setLabel(toString(rndPreviousDist,  length-1));
1398             oldList.print(listFile);
1399             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1400         }
1401
1402         delete matrix; delete list;     delete cluster; delete rabund;
1403         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1404         listFile.close();
1405
1406         if (m->control_pressed) { //clean up
1407             m->mothurRemove(listFileName);
1408             return listFileName;
1409         }
1410
1411         m->mothurRemove(thisDistFile);
1412         m->mothurRemove(thisNamefile);
1413
1414         if (saveCutoff != cutoff) {
1415             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1416             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1417
1418             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1419         }
1420
1421         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1422
1423         return listFileName;
1424
1425         }
1426         catch(exception& e) {
1427                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1428                 exit(1);
1429         }
1430 }
1431 //**********************************************************************************************************************
1432
1433 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1434         try{
1435
1436 #ifdef USE_MPI
1437                 int pid;
1438                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1439
1440                 if (pid != 0) {
1441 #endif
1442
1443                 string thisOutputDir = outputDir;
1444                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1445         map<string, string> variables;
1446         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1447                 string outputFileName = getOutputFileName("column", variables);
1448                 m->mothurRemove(outputFileName);
1449
1450
1451                 for (int i = 0; i < distNames.size(); i++) {
1452                         if (m->control_pressed) {  return 0; }
1453
1454                         string thisDistFile = distNames[i].begin()->first;
1455
1456                         m->appendFiles(thisDistFile, outputFileName);
1457                 }
1458
1459                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1460
1461 #ifdef USE_MPI
1462                 }
1463 #endif
1464
1465                 return 0;
1466
1467
1468         }
1469         catch(exception& e) {
1470                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1471                 exit(1);
1472         }
1473 }
1474 //**********************************************************************************************************************
1475 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1476     try {
1477         rabund->setLabel(list->getLabel());
1478         for(int i = 0; i < list->getNumBins(); i++) {
1479             if (m->control_pressed) { break; }
1480             vector<string> binNames;
1481             string bin = list->get(i);
1482             m->splitAtComma(bin, binNames);
1483             int total = 0;
1484             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1485             rabund->push_back(total);
1486         }
1487         return 0;
1488     }
1489     catch(exception& e) {
1490                 m->errorOut(e, "ClusterCommand", "createRabund");
1491                 exit(1);
1492         }
1493
1494 }
1495 //**********************************************************************************************************************