clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
  20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
  21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
  22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
  23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
  24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
  25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
  26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
  27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
  28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
  29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
  30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
  31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
  32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
  33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
  34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
  35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
  36
  37                 vector<string> myArray;
  38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  39                 return myArray;
  40         }
  41         catch(exception& e) {
  42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  43                 exit(1);
  44         }
  45 }
  46 //**********************************************************************************************************************
  47 string ClusterSplitCommand::getHelpString(){
  48         try {
  49                 string helpString = "";
  50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
  51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  59                 helpString += "The name parameter allows you to enter your name file. \n";
  60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
  62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  64                 helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
  65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  70 #ifdef USE_MPI
  71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  72 #endif
  73                 helpString += "The cluster.split command should be in the following format: \n";
  74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  76                 return helpString;
  77         }
  78         catch(exception& e) {
  79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  80                 exit(1);
  81         }
  82 }
  83 //**********************************************************************************************************************
  84 string ClusterSplitCommand::getOutputPattern(string type) {
  85     try {
  86         string pattern = "";
  87
  88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
  89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
  90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
  91         else if (type == "column") {  pattern = "[filename],dist"; }
  92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
  93
  94         return pattern;
  95     }
  96     catch(exception& e) {
  97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
  98         exit(1);
  99     }
 100 }
 101 //**********************************************************************************************************************
 102 ClusterSplitCommand::ClusterSplitCommand(){
 103         try {
 104                 abort = true; calledHelp = true;
 105                 setParameters();
 106                 vector<string> tempOutNames;
 107                 outputTypes["list"] = tempOutNames;
 108                 outputTypes["rabund"] = tempOutNames;
 109                 outputTypes["sabund"] = tempOutNames;
 110                 outputTypes["column"] = tempOutNames;
 111         }
 112         catch(exception& e) {
 113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 114                 exit(1);
 115         }
 116 }
 117 //**********************************************************************************************************************
 118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 120         try{
 121                 abort = false; calledHelp = false;
 122                 format = "";
 123
 124                 //allow user to run help
 125                 if(option == "help") { help(); abort = true; calledHelp = true; }
 126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 127
 128                 else {
 129                         vector<string> myArray = setParameters();
 130
 131                         OptionParser parser(option);
 132                         map<string,string> parameters = parser.getParameters();
 133
 134                         ValidParameters validParameter("cluster.split");
 135
 136                         //check to make sure all parameters are valid for command
 137                         map<string,string>::iterator it;
 138                         for (it = parameters.begin(); it != parameters.end(); it++) {
 139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 140                                         abort = true;
 141                                 }
 142                         }
 143
 144                         //initialize outputTypes
 145                         vector<string> tempOutNames;
 146                         outputTypes["list"] = tempOutNames;
 147                         outputTypes["rabund"] = tempOutNames;
 148                         outputTypes["sabund"] = tempOutNames;
 149                         outputTypes["column"] = tempOutNames;
 150
 151                         //if the user changes the output directory command factory will send this info to us in the output parameter
 152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 153
 154                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 156                         if (inputDir == "not found"){   inputDir = "";          }
 157                         else {
 158                                 string path;
 159                                 it = parameters.find("phylip");
 160                                 //user has given a template file
 161                                 if(it != parameters.end()){
 162                                         path = m->hasPath(it->second);
 163                                         //if the user has not given a path then, add inputdir. else leave path alone.
 164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 165                                 }
 166
 167                                 it = parameters.find("column");
 168                                 //user has given a template file
 169                                 if(it != parameters.end()){
 170                                         path = m->hasPath(it->second);
 171                                         //if the user has not given a path then, add inputdir. else leave path alone.
 172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 173                                 }
 174
 175                                 it = parameters.find("name");
 176                                 //user has given a template file
 177                                 if(it != parameters.end()){
 178                                         path = m->hasPath(it->second);
 179                                         //if the user has not given a path then, add inputdir. else leave path alone.
 180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 181                                 }
 182
 183                                 it = parameters.find("taxonomy");
 184                                 //user has given a template file
 185                                 if(it != parameters.end()){
 186                                         path = m->hasPath(it->second);
 187                                         //if the user has not given a path then, add inputdir. else leave path alone.
 188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 189                                 }
 190
 191                                 it = parameters.find("fasta");
 192                                 //user has given a template file
 193                                 if(it != parameters.end()){
 194                                         path = m->hasPath(it->second);
 195                                         //if the user has not given a path then, add inputdir. else leave path alone.
 196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 197                                 }
 198
 199                 it = parameters.find("count");
 200                                 //user has given a template file
 201                                 if(it != parameters.end()){
 202                                         path = m->hasPath(it->second);
 203                                         //if the user has not given a path then, add inputdir. else leave path alone.
 204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 205                                 }
 206                         }
 207
 208                         //check for required parameters
 209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 210                         if (phylipfile == "not open") { abort = true; }
 211                         else if (phylipfile == "not found") { phylipfile = ""; }
 212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 213
 214                         columnfile = validParameter.validFile(parameters, "column", true);
 215                         if (columnfile == "not open") { abort = true; }
 216                         else if (columnfile == "not found") { columnfile = ""; }
 217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 218
 219                         namefile = validParameter.validFile(parameters, "name", true);
 220                         if (namefile == "not open") { abort = true; namefile = "";}
 221                         else if (namefile == "not found") { namefile = "";  }
 222                         else { m->setNameFile(namefile); }
 223
 224             countfile = validParameter.validFile(parameters, "count", true);
 225                         if (countfile == "not open") { abort = true; countfile = "";}
 226                         else if (countfile == "not found") { countfile = "";  }
 227                         else { m->setCountTableFile(countfile); }
 228
 229                         fastafile = validParameter.validFile(parameters, "fasta", true);
 230                         if (fastafile == "not open") { abort = true; }
 231                         else if (fastafile == "not found") { fastafile = ""; }
 232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 233
 234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 235                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 236                         else if (taxFile == "not found") { taxFile = ""; }
 237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 238
 239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 240                                 //is there are current file available for either of these?
 241                                 //give priority to column, then phylip, then fasta
 242                                 columnfile = m->getColumnFile();
 243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 244                                 else {
 245                                         phylipfile = m->getPhylipFile();
 246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 247                                         else {
 248                                                 fastafile = m->getFastaFile();
 249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 250                                                 else {
 251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 252                                                         abort = true;
 253                                                 }
 254                                         }
 255                                 }
 256                         }
 257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 258
 259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 260
 261                         if (columnfile != "") {
 262                                 if ((namefile == "") && (countfile == "")) {
 263                                         namefile = m->getNameFile();
 264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 265                                         else {
 266                                                 countfile = m->getCountTableFile();
 267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 268                         else {
 269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 270                             abort = true;
 271                         }
 272                                         }
 273                                 }
 274                         }
 275
 276                         if (fastafile != "") {
 277                                 if (taxFile == "") {
 278                                         taxFile = m->getTaxonomyFile();
 279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 280                                         else {
 281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 282                                                 abort = true;
 283                                         }
 284                                 }
 285
 286                                 if ((namefile == "") && (countfile == "")) {
 287                                         namefile = m->getNameFile();
 288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 289                                         else {
 290                                                 countfile = m->getCountTableFile();
 291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 292                         else {
 293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 294                             abort = true;
 295                         }
 296                                         }
 297                                 }
 298                         }
 299
 300                         //check for optional parameter and set defaults
 301                         // ...at some point should added some additional type checking...
 302                         //get user cutoff and precision or use defaults
 303                         string temp;
 304                         temp = validParameter.validFile(parameters, "precision", false);
 305                         if (temp == "not found") { temp = "100"; }
 306                         //saves precision legnth for formatting below
 307                         length = temp.length();
 308                         m->mothurConvert(temp, precision);
 309
 310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 311                         hard = m->isTrue(temp);
 312
 313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 314                         large = m->isTrue(temp);
 315
 316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 317                         m->setProcessors(temp);
 318                         m->mothurConvert(temp, processors);
 319
 320                         temp = validParameter.validFile(parameters, "splitmethod", false);
 321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 322                                 if (temp == "not found")  { splitmethod = "distance"; }
 323                                 else {  splitmethod = temp; }
 324                         }
 325
 326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 327                         classic = m->isTrue(temp);
 328
 329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 330
 331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 332                         m->mothurConvert(temp, cutoff);
 333                         cutoff += (5 / (precision * 10.0));
 334
 335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 336                         m->mothurConvert(temp, taxLevelCutoff);
 337
 338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 339
 340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 342
 343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 345
 346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 347
 348                         showabund = validParameter.validFile(parameters, "showabund", false);
 349                         if (showabund == "not found") { showabund = "T"; }
 350
 351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
 352             runCluster = m->isTrue(temp);
 353
 354                         timing = validParameter.validFile(parameters, "timing", false);
 355                         if (timing == "not found") { timing = "F"; }
 356
 357                 }
 358         }
 359         catch(exception& e) {
 360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 361                 exit(1);
 362         }
 363 }
 364
 365 //**********************************************************************************************************************
 366
 367 int ClusterSplitCommand::execute(){
 368         try {
 369
 370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 371
 372                 time_t estart;
 373                 vector<string> listFileNames;
 374                 set<string> labels;
 375                 string singletonName = "";
 376                 double saveCutoff = cutoff;
 377
 378                 //****************** file prep work ******************************//
 379                 #ifdef USE_MPI
 380                         int pid;
 381                         int tag = 2001;
 382                         MPI_Status status;
 383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 385
 386                         if (pid == 0) { //only process 0 converts and splits
 387
 388                 #endif
 389
 390                 //if user gave a phylip file convert to column file
 391                 if (format == "phylip") {
 392                         estart = time(NULL);
 393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 394
 395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 396
 397                         NameAssignment* nameMap = NULL;
 398                         convert->setFormat("phylip");
 399                         convert->read(nameMap);
 400
 401                         if (m->control_pressed) {  delete convert;  return 0;  }
 402
 403                         distfile = convert->getOutputFile();
 404
 405                         //if no names file given with phylip file, create it
 406                         ListVector* listToMakeNameFile =  convert->getListVector();
 407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 408                                 ofstream out;
 409                                 namefile = phylipfile + ".names";
 410                                 m->openOutputFile(namefile, out);
 411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 412                                         string bin = listToMakeNameFile->get(i);
 413                                         out << bin << '\t' << bin << endl;
 414                                 }
 415                                 out.close();
 416                         }
 417                         delete listToMakeNameFile;
 418                         delete convert;
 419
 420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 421                 }
 422                 if (m->control_pressed) { return 0; }
 423
 424                 estart = time(NULL);
 425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 426
 427                 //split matrix into non-overlapping groups
 428                 SplitMatrix* split;
 429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 433
 434                 split->split();
 435
 436                 if (m->control_pressed) { delete split; return 0; }
 437
 438                 singletonName = split->getSingletonNames();
 439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 440                 delete split;
 441
 442         if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
 443
 444                 //output a merged distance file
 445                 //if (splitmethod == "fasta")           { createMergedDistanceFile(distName); }
 446
 447                 if (m->control_pressed) { return 0; }
 448
 449                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 450                 estart = time(NULL);
 451
 452         if (!runCluster) {
 453
 454                 m->mothurOutEndLine();
 455                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 456                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
 457                 m->mothurOutEndLine();
 458                 return 0;
 459
 460         }
 461
 462                 //****************** break up files between processes and cluster each file set ******************************//
 463         #ifdef USE_MPI
 464                         ////you are process 0 from above////
 465
 466                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 467                         dividedNames.resize(processors);
 468
 469                         //for each file group figure out which process will complete it
 470                         //want to divide the load intelligently so the big files are spread between processes
 471                         for (int i = 0; i < distName.size(); i++) {
 472                                 int processToAssign = (i+1) % processors;
 473                                 if (processToAssign == 0) { processToAssign = processors; }
 474
 475                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 476                         }
 477
 478                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 479                         for (int i = 0; i < processors; i++) {
 480                                 int remainder = ((i+1) % processors);
 481                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 482                         }
 483
 484
 485                         //send each child the list of files it needs to process
 486                         for(int i = 1; i < processors; i++) {
 487                                 //send number of file pairs
 488                                 int num = dividedNames[i].size();
 489                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 490
 491                                 for (int j = 0; j < num; j++) { //send filenames to process i
 492                                         char tempDistFileName[1024];
 493                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 494                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 495
 496                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 497                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 498
 499                                         char tempNameFileName[1024];
 500                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 501                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 502
 503                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 504                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 505                                 }
 506                         }
 507
 508                         //process your share
 509                         listFileNames = cluster(dividedNames[0], labels);
 510
 511                         //receive the other processes info
 512                         for(int i = 1; i < processors; i++) {
 513                                 int num = dividedNames[i].size();
 514
 515                                 double tempCutoff;
 516                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 517                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 518
 519                                 //send list filenames to root process
 520                                 for (int j = 0; j < num; j++) {
 521                                         int lengthList = 0;
 522                                         char tempListFileName[1024];
 523
 524                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 525                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 526
 527                                         string myListFileName = tempListFileName;
 528                                         myListFileName = myListFileName.substr(0, lengthList);
 529
 530                                         listFileNames.push_back(myListFileName);
 531                                 }
 532
 533                                 //send Labels to root process
 534                                 int numLabels = 0;
 535                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 536
 537                                 for (int j = 0; j < numLabels; j++) {
 538                                         int lengthLabel = 0;
 539                                         char tempLabel[100];
 540
 541                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 542                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 543
 544                                         string myLabel = tempLabel;
 545                                         myLabel = myLabel.substr(0, lengthLabel);
 546
 547                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 548                                 }
 549                         }
 550
 551                 }else { //you are a child process
 552                         vector < map<string, string> >  myNames;
 553
 554                         //recieve the files you need to process
 555                         //receive number of file pairs
 556                         int num = 0;
 557                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 558
 559                         myNames.resize(num);
 560
 561                         for (int j = 0; j < num; j++) { //receive filenames to process
 562                                 int lengthDist = 0;
 563                                 char tempDistFileName[1024];
 564
 565                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 566                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 567
 568                                 string myDistFileName = tempDistFileName;
 569                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 570
 571                                 int lengthName = 0;
 572                                 char tempNameFileName[1024];
 573
 574                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 575                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 576
 577                                 string myNameFileName = tempNameFileName;
 578                                 myNameFileName = myNameFileName.substr(0, lengthName);
 579
 580                                 //save file name
 581                                 myNames[j][myDistFileName] = myNameFileName;
 582                         }
 583
 584                         //process them
 585                         listFileNames = cluster(myNames, labels);
 586
 587                         //send cutoff
 588                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 589
 590                         //send list filenames to root process
 591                         for (int j = 0; j < num; j++) {
 592                                 char tempListFileName[1024];
 593                                 strcpy(tempListFileName, listFileNames[j].c_str());
 594                                 int lengthList = listFileNames[j].length();
 595
 596                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 597                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 598                         }
 599
 600                         //send Labels to root process
 601                         int numLabels = labels.size();
 602                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 603
 604                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 605                                 char tempLabel[100];
 606                                 strcpy(tempLabel, (*it).c_str());
 607                                 int lengthLabel = (*it).length();
 608
 609                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 610                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 611                         }
 612                 }
 613
 614                 //make everyone wait
 615                 MPI_Barrier(MPI_COMM_WORLD);
 616
 617         #else
 618                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 619                 //sanity check
 620                 if (processors > distName.size()) { processors = distName.size(); }
 621
 622                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 623                                 if(processors == 1){
 624                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 625                                 }else{
 626                                         listFileNames = createProcesses(distName, labels);
 627                 }
 628                 #else
 629                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 630                 #endif
 631         #endif
 632                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 633
 634                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 635
 636                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 637
 638                 //****************** merge list file and create rabund and sabund files ******************************//
 639                 estart = time(NULL);
 640                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 641
 642                 #ifdef USE_MPI
 643                         if (pid == 0) { //only process 0 merges
 644                 #endif
 645
 646                 ListVector* listSingle;
 647                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 648
 649                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 650
 651                 mergeLists(listFileNames, labelBins, listSingle);
 652
 653                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 654
 655                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 656
 657                 //set list file as new current listfile
 658                 string current = "";
 659                 itTypes = outputTypes.find("list");
 660                 if (itTypes != outputTypes.end()) {
 661                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 662                 }
 663
 664                 //set rabund file as new current rabundfile
 665                 itTypes = outputTypes.find("rabund");
 666                 if (itTypes != outputTypes.end()) {
 667                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 668                 }
 669
 670                 //set sabund file as new current sabundfile
 671                 itTypes = outputTypes.find("sabund");
 672                 if (itTypes != outputTypes.end()) {
 673                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 674                 }
 675
 676                 m->mothurOutEndLine();
 677                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 678                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 679                 m->mothurOutEndLine();
 680
 681                 #ifdef USE_MPI
 682                         } //only process 0 merges
 683
 684                         //make everyone wait
 685                         MPI_Barrier(MPI_COMM_WORLD);
 686                 #endif
 687
 688                 return 0;
 689         }
 690         catch(exception& e) {
 691                 m->errorOut(e, "ClusterSplitCommand", "execute");
 692                 exit(1);
 693         }
 694 }
 695 //**********************************************************************************************************************
 696 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 697         try {
 698
 699                 map<float, int> labelBin;
 700                 vector<float> orderFloat;
 701                 int numSingleBins;
 702
 703                 //read in singletons
 704                 if (singleton != "none") {
 705
 706             ifstream in;
 707             m->openInputFile(singleton, in);
 708
 709                         string firstCol, secondCol;
 710                         listSingle = new ListVector();
 711
 712             if (countfile != "") { m->getline(in); m->gobble(in); }
 713
 714                         while (!in.eof()) {
 715                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 716                                 if (countfile == "") { listSingle->push_back(secondCol); }
 717                 else { listSingle->push_back(firstCol); }
 718                         }
 719
 720                         in.close();
 721                         m->mothurRemove(singleton);
 722
 723                         numSingleBins = listSingle->getNumBins();
 724                 }else{  listSingle = NULL; numSingleBins = 0;  }
 725
 726                 //go through users set and make them floats so we can sort them
 727                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 728                         float temp = -10.0;
 729
 730                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 731                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 732
 733                         if (temp <= cutoff) {
 734                                 orderFloat.push_back(temp);
 735                                 labelBin[temp] = numSingleBins; //initialize numbins
 736                         }
 737                 }
 738
 739                 //sort order
 740                 sort(orderFloat.begin(), orderFloat.end());
 741                 userLabels.clear();
 742
 743                 //get the list info from each file
 744                 for (int k = 0; k < listNames.size(); k++) {
 745
 746                         if (m->control_pressed) {
 747                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 748                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 749                                 return labelBin;
 750                         }
 751
 752                         InputData* input = new InputData(listNames[k], "list");
 753                         ListVector* list = input->getListVector();
 754                         string lastLabel = list->getLabel();
 755
 756                         string filledInList = listNames[k] + "filledInTemp";
 757                         ofstream outFilled;
 758                         m->openOutputFile(filledInList, outFilled);
 759
 760                         //for each label needed
 761                         for(int l = 0; l < orderFloat.size(); l++){
 762
 763                                 string thisLabel;
 764                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 765                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 766
 767                                 //this file has reached the end
 768                                 if (list == NULL) {
 769                                         list = input->getListVector(lastLabel, true);
 770                                 }else{  //do you have the distance, or do you need to fill in
 771
 772                                         float labelFloat;
 773                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 774                                         else { convert(list->getLabel(), labelFloat); }
 775
 776                                         //check for missing labels
 777                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 778                                                 //if its bigger get last label, otherwise keep it
 779                                                 delete list;
 780                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 781                                         }
 782                                         lastLabel = list->getLabel();
 783                                 }
 784
 785                                 //print to new file
 786                                 list->setLabel(thisLabel);
 787                                 list->print(outFilled);
 788
 789                                 //update labelBin
 790                                 labelBin[orderFloat[l]] += list->getNumBins();
 791
 792                                 delete list;
 793
 794                                 list = input->getListVector();
 795                         }
 796
 797                         if (list != NULL) { delete list; }
 798                         delete input;
 799
 800                         outFilled.close();
 801                         m->mothurRemove(listNames[k]);
 802                         rename(filledInList.c_str(), listNames[k].c_str());
 803                 }
 804
 805                 return labelBin;
 806         }
 807         catch(exception& e) {
 808                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 809                 exit(1);
 810         }
 811 }
 812 //**********************************************************************************************************************
 813 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 814         try {
 815                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 816                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 817
 818         map<string, string> variables;
 819         variables["[filename]"] = fileroot;
 820         variables["[clustertag]"] = tag;
 821         string sabundFileName = getOutputFileName("sabund", variables);
 822         string rabundFileName = getOutputFileName("rabund", variables);
 823         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
 824         string listFileName = getOutputFileName("list", variables);
 825
 826         if (countfile == "") {
 827             m->openOutputFile(sabundFileName,   outSabund);
 828             m->openOutputFile(rabundFileName,   outRabund);
 829             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 830             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 831
 832         }
 833                 m->openOutputFile(listFileName, outList);
 834         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 835
 836                 map<float, int>::iterator itLabel;
 837
 838         //clears out junk for autocompleting of list files above.  Perhaps there is a beter way to handle this from within the data structure?
 839         m->printedListHeaders = false;
 840
 841                 //for each label needed
 842                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 843
 844                         string thisLabel;
 845                         if (itLabel->first == -1) { thisLabel = "unique"; }
 846                         else { thisLabel = toString(itLabel->first,  length-1);  }
 847
 848                         //outList << thisLabel << '\t' << itLabel->second << '\t';
 849
 850             RAbundVector* rabund = NULL;
 851             ListVector completeList;
 852             completeList.setLabel(thisLabel);
 853
 854             if (countfile == "") {
 855                 rabund = new RAbundVector();
 856                 rabund->setLabel(thisLabel);
 857             }
 858
 859                         //add in singletons
 860                         if (listSingle != NULL) {
 861                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 862                                         //outList << listSingle->get(j) << '\t';
 863                     completeList.push_back(listSingle->get(j));
 864                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 865                                 }
 866                         }
 867
 868                         //get the list info from each file
 869                         for (int k = 0; k < listNames.size(); k++) {
 870
 871                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 872
 873                                 InputData* input = new InputData(listNames[k], "list");
 874                                 ListVector* list = input->getListVector(thisLabel);
 875
 876                                 //this file has reached the end
 877                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 878                                 else {
 879                                         for (int j = 0; j < list->getNumBins(); j++) {
 880                                                 //outList << list->get(j) << '\t';
 881                         completeList.push_back(list->get(j));
 882                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 883                                         }
 884                                         delete list;
 885                                 }
 886                                 delete input;
 887                         }
 888
 889             if (countfile == "") {
 890                 SAbundVector sabund = rabund->getSAbundVector();
 891                 sabund.print(outSabund);
 892                 rabund->print(outRabund);
 893             }
 894                         //outList << endl;
 895             if (!m->printedListHeaders) {
 896                 m->listBinLabelsInFile.clear(); completeList.printHeaders(outList); }
 897             completeList.print(outList);
 898
 899                         if (rabund != NULL) { delete rabund; }
 900                 }
 901
 902                 outList.close();
 903         if (countfile == "") {
 904             outRabund.close();
 905             outSabund.close();
 906                 }
 907                 if (listSingle != NULL) { delete listSingle;  }
 908
 909                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 910
 911                 return 0;
 912         }
 913         catch(exception& e) {
 914                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 915                 exit(1);
 916         }
 917 }
 918
 919 //**********************************************************************************************************************
 920
 921 void ClusterSplitCommand::printData(ListVector* oldList){
 922         try {
 923                 string label = oldList->getLabel();
 924                 RAbundVector oldRAbund = oldList->getRAbundVector();
 925
 926                 oldRAbund.setLabel(label);
 927                 if (m->isTrue(showabund)) {
 928                         oldRAbund.getSAbundVector().print(cout);
 929                 }
 930                 oldRAbund.print(outRabund);
 931                 oldRAbund.getSAbundVector().print(outSabund);
 932
 933                 oldList->print(outList);
 934         }
 935         catch(exception& e) {
 936                 m->errorOut(e, "ClusterSplitCommand", "printData");
 937                 exit(1);
 938         }
 939 }
 940 //**********************************************************************************************************************
 941 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 942         try {
 943
 944         vector<string> listFiles;
 945         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 946         dividedNames.resize(processors);
 947
 948         //for each file group figure out which process will complete it
 949         //want to divide the load intelligently so the big files are spread between processes
 950         for (int i = 0; i < distName.size(); i++) {
 951             //cout << i << endl;
 952             int processToAssign = (i+1) % processors;
 953             if (processToAssign == 0) { processToAssign = processors; }
 954
 955             dividedNames[(processToAssign-1)].push_back(distName[i]);
 956             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 957         }
 958
 959         //now lets reverse the order of ever other process, so we balance big files running with little ones
 960         for (int i = 0; i < processors; i++) {
 961             //cout << i << endl;
 962             int remainder = ((i+1) % processors);
 963             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 964         }
 965
 966         if (m->control_pressed) { return listFiles; }
 967
 968         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 969                 int process = 1;
 970                 processIDS.clear();
 971
 972                 //loop through and create all the processes you want
 973                 while (process != processors) {
 974                         int pid = fork();
 975
 976                         if (pid > 0) {
 977                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 978                                 process++;
 979                         }else if (pid == 0){
 980                                 set<string> labels;
 981                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 982
 983                                 //write out names to file
 984                                 string filename = toString(getpid()) + ".temp";
 985                                 ofstream out;
 986                                 m->openOutputFile(filename, out);
 987                                 out << tag << endl;
 988                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 989                                 out.close();
 990
 991                                 //print out labels
 992                                 ofstream outLabels;
 993                                 filename = toString(getpid()) + ".temp.labels";
 994                                 m->openOutputFile(filename, outLabels);
 995
 996                                 outLabels << cutoff << endl;
 997                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 998                                         outLabels << (*it) << endl;
 999                                 }
1000                                 outLabels.close();
1001
1002                                 exit(0);
1003                         }else {
1004                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1005                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1006                                 exit(0);
1007                         }
1008                 }
1009
1010         //do your part
1011         listFiles = cluster(dividedNames[0], labels);
1012
1013                 //force parent to wait until all the processes are done
1014                 for (int i=0;i< processIDS.size();i++) {
1015                         int temp = processIDS[i];
1016                         wait(&temp);
1017                 }
1018
1019         //get list of list file names from each process
1020         for(int i=0;i<processIDS.size();i++){
1021             string filename = toString(processIDS[i]) + ".temp";
1022             ifstream in;
1023             m->openInputFile(filename, in);
1024
1025             in >> tag; m->gobble(in);
1026
1027             while(!in.eof()) {
1028                 string tempName;
1029                 in >> tempName; m->gobble(in);
1030                 listFiles.push_back(tempName);
1031             }
1032             in.close();
1033             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1034
1035             //get labels
1036             filename = toString(processIDS[i]) + ".temp.labels";
1037             ifstream in2;
1038             m->openInputFile(filename, in2);
1039
1040             float tempCutoff;
1041             in2 >> tempCutoff; m->gobble(in2);
1042             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1043
1044             while(!in2.eof()) {
1045                 string tempName;
1046                 in2 >> tempName; m->gobble(in2);
1047                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1048             }
1049             in2.close();
1050             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1051         }
1052
1053
1054     #else
1055
1056         //////////////////////////////////////////////////////////////////////////////////////////////////////
1057                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1058                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1059                 //Taking advantage of shared memory to allow both threads to add labels.
1060                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1061                 /*
1062                 vector<clusterData*> pDataArray;
1063                 DWORD   dwThreadIdArray[processors-1];
1064                 HANDLE  hThreadArray[processors-1];
1065
1066                 //Create processor worker threads.
1067                 for( int i=1; i<processors; i++ ){
1068                         // Allocate memory for thread data.
1069                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1070                         pDataArray.push_back(tempCluster);
1071                         processIDS.push_back(i);
1072
1073                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1074                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1075                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1076
1077                 }
1078
1079         //do your part
1080         listFiles = cluster(dividedNames[0], labels);
1081
1082                 //Wait until all threads have terminated.
1083                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1084
1085                 //Close all thread handles and free memory allocations.
1086                 for(int i=0; i < pDataArray.size(); i++){
1087             //get tag
1088             tag = pDataArray[i]->tag;
1089             //get listfiles created
1090             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1091             //get labels
1092             set<string>::iterator it;
1093             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1094                         //check cutoff
1095             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1096                         CloseHandle(hThreadArray[i]);
1097                         delete pDataArray[i];
1098                 }
1099 */
1100         #endif
1101
1102         return listFiles;
1103
1104         }
1105         catch(exception& e) {
1106                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1107                 exit(1);
1108         }
1109 }
1110 //**********************************************************************************************************************
1111
1112 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1113         try {
1114
1115                 vector<string> listFileNames;
1116                 double smallestCutoff = cutoff;
1117
1118                 //cluster each distance file
1119                 for (int i = 0; i < distNames.size(); i++) {
1120
1121                         string thisNamefile = distNames[i].begin()->second;
1122                         string thisDistFile = distNames[i].begin()->first;
1123
1124                         string listFileName = "";
1125             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1126             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1127
1128                         if (m->control_pressed) { //clean up
1129                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1130                                 listFileNames.clear(); return listFileNames;
1131                         }
1132
1133             listFileNames.push_back(listFileName);
1134         }
1135
1136                 cutoff = smallestCutoff;
1137
1138                 return listFileNames;
1139
1140         }
1141         catch(exception& e) {
1142                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1143                 exit(1);
1144         }
1145
1146
1147 }
1148 //**********************************************************************************************************************
1149 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1150         try {
1151         string listFileName = "";
1152
1153         ListVector* list = NULL;
1154         ListVector oldList;
1155         RAbundVector* rabund = NULL;
1156
1157 #ifdef USE_MPI
1158         int pid;
1159         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1160
1161         //output your files too
1162         if (pid != 0) {
1163             cout << endl << "Reading " << thisDistFile << endl;
1164         }
1165 #endif
1166
1167         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1168
1169         //reads phylip file storing data in 2D vector, also fills list and rabund
1170         bool sim = false;
1171                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1172
1173         NameAssignment* nameMap = NULL;
1174         CountTable* ct = NULL;
1175         if(namefile != ""){
1176                         nameMap = new NameAssignment(thisNamefile);
1177                         nameMap->readMap();
1178             cluster->readPhylipFile(thisDistFile, nameMap);
1179                 }else if (countfile != "") {
1180             ct = new CountTable();
1181             ct->readTable(thisNamefile, false, false);
1182             cluster->readPhylipFile(thisDistFile, ct);
1183         }
1184         tag = cluster->getTag();
1185
1186                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1187             else { delete ct; } delete cluster; return 0; }
1188
1189                 list = cluster->getListVector();
1190                 rabund = cluster->getRAbundVector();
1191
1192                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1193                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1194         listFileName = fileroot+ tag + ".list";
1195
1196         ofstream listFile;
1197                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1198
1199                 float previousDist = 0.00000;
1200                 float rndPreviousDist = 0.00000;
1201                 oldList = *list;
1202
1203 #ifdef USE_MPI
1204         //output your files too
1205         if (pid != 0) {
1206             cout << endl << "Clustering " << thisDistFile << endl;
1207         }
1208 #endif
1209
1210         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1211
1212                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1213                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1214                 else { delete ct; } return listFileName;  }
1215
1216                         cluster->update(cutoff);
1217
1218                         float dist = cluster->getSmallDist();
1219                         float rndDist;
1220                         if (hard) {
1221                                 rndDist = m->ceilDist(dist, precision);
1222                         }else{
1223                                 rndDist = m->roundDist(dist, precision);
1224                         }
1225
1226             if(previousDist <= 0.0000 && dist != previousDist){
1227                 oldList.setLabel("unique");
1228                 oldList.print(listFile);
1229                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1230             }
1231             else if(rndDist != rndPreviousDist){
1232                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1233                 oldList.print(listFile);
1234                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1235             }
1236
1237
1238                         previousDist = dist;
1239                         rndPreviousDist = rndDist;
1240                         oldList = *list;
1241                 }
1242
1243                 if(previousDist <= 0.0000){
1244             oldList.setLabel("unique");
1245             oldList.print(listFile);
1246             if (labels.count("unique") == 0) { labels.insert("unique"); }
1247         }
1248         else if(rndPreviousDist<cutoff){
1249             oldList.setLabel(toString(rndPreviousDist,  length-1));
1250             oldList.print(listFile);
1251             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1252         }
1253
1254
1255                 listFile.close();
1256
1257                 delete cluster;  delete list; delete rabund;
1258         if(namefile != ""){     delete nameMap; }
1259         else { delete ct; }
1260
1261         m->mothurRemove(thisDistFile);
1262         m->mothurRemove(thisNamefile);
1263
1264         return listFileName;
1265
1266         }
1267         catch(exception& e) {
1268                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1269                 exit(1);
1270         }
1271 }
1272
1273 //**********************************************************************************************************************
1274 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1275         try {
1276         string listFileName = "";
1277
1278         Cluster* cluster = NULL;
1279         SparseDistanceMatrix* matrix = NULL;
1280         ListVector* list = NULL;
1281         ListVector oldList;
1282         RAbundVector* rabund = NULL;
1283
1284         if (m->control_pressed) { return listFileName; }
1285
1286 #ifdef USE_MPI
1287         int pid;
1288         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1289
1290         //output your files too
1291         if (pid != 0) {
1292             cout << endl << "Reading " << thisDistFile << endl;
1293         }
1294 #endif
1295
1296         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1297
1298         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1299         read->setCutoff(cutoff);
1300
1301         NameAssignment* nameMap = NULL;
1302         CountTable* ct = NULL;
1303                 if(namefile != ""){
1304                         nameMap = new NameAssignment(thisNamefile);
1305                         nameMap->readMap();
1306             read->read(nameMap);
1307                 }else if (countfile != "") {
1308             ct = new CountTable();
1309             ct->readTable(thisNamefile, false, false);
1310             read->read(ct);
1311         }else { read->read(nameMap); }
1312
1313                 list = read->getListVector();
1314         oldList = *list;
1315                 matrix = read->getDMatrix();
1316
1317                 if(countfile != "") {
1318             rabund = new RAbundVector();
1319             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1320             delete ct;
1321         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1322
1323         delete read;  read = NULL;
1324         if (namefile != "") { delete nameMap; nameMap = NULL; }
1325
1326
1327 #ifdef USE_MPI
1328         //output your files too
1329         if (pid != 0) {
1330             cout << endl << "Clustering " << thisDistFile << endl;
1331         }
1332 #endif
1333
1334         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1335
1336         //create cluster
1337         float adjust = -1.0;
1338         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
1339         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
1340         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust);     }
1341         tag = cluster->getTag();
1342
1343         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1344         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1345
1346         ofstream listFile;
1347         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1348
1349         listFileName = fileroot+ tag + ".list";
1350
1351         float previousDist = 0.00000;
1352         float rndPreviousDist = 0.00000;
1353
1354         oldList = *list;
1355
1356         print_start = true;
1357         start = time(NULL);
1358         double saveCutoff = cutoff;
1359
1360         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1361
1362             if (m->control_pressed) { //clean up
1363                 delete matrix; delete list;     delete cluster; delete rabund;
1364                 listFile.close();
1365                 m->mothurRemove(listFileName);
1366                 return listFileName;
1367             }
1368
1369             cluster->update(saveCutoff);
1370
1371             float dist = matrix->getSmallDist();
1372             float rndDist;
1373             if (hard) {
1374                 rndDist = m->ceilDist(dist, precision);
1375             }else{
1376                 rndDist = m->roundDist(dist, precision);
1377             }
1378
1379             if(previousDist <= 0.0000 && dist != previousDist){
1380                 oldList.setLabel("unique");
1381                 oldList.print(listFile);
1382                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1383             }
1384             else if(rndDist != rndPreviousDist){
1385                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1386                 oldList.print(listFile);
1387                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1388             }
1389
1390             previousDist = dist;
1391             rndPreviousDist = rndDist;
1392             oldList = *list;
1393         }
1394
1395
1396         if(previousDist <= 0.0000){
1397             oldList.setLabel("unique");
1398             oldList.print(listFile);
1399             if (labels.count("unique") == 0) { labels.insert("unique"); }
1400         }
1401         else if(rndPreviousDist<cutoff){
1402             oldList.setLabel(toString(rndPreviousDist,  length-1));
1403             oldList.print(listFile);
1404             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1405         }
1406
1407         delete matrix; delete list;     delete cluster; delete rabund;
1408         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1409         listFile.close();
1410
1411         if (m->control_pressed) { //clean up
1412             m->mothurRemove(listFileName);
1413             return listFileName;
1414         }
1415
1416         m->mothurRemove(thisDistFile);
1417         m->mothurRemove(thisNamefile);
1418
1419         if (saveCutoff != cutoff) {
1420             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1421             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1422
1423             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1424         }
1425
1426         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1427
1428         return listFileName;
1429
1430         }
1431         catch(exception& e) {
1432                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1433                 exit(1);
1434         }
1435 }
1436 //**********************************************************************************************************************
1437
1438 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1439         try{
1440
1441 #ifdef USE_MPI
1442                 int pid;
1443                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1444
1445                 if (pid != 0) {
1446 #endif
1447
1448                 string thisOutputDir = outputDir;
1449                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1450         map<string, string> variables;
1451         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1452                 string outputFileName = getOutputFileName("column", variables);
1453                 m->mothurRemove(outputFileName);
1454
1455
1456                 for (int i = 0; i < distNames.size(); i++) {
1457                         if (m->control_pressed) {  return 0; }
1458
1459                         string thisDistFile = distNames[i].begin()->first;
1460
1461                         m->appendFiles(thisDistFile, outputFileName);
1462                 }
1463
1464                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1465
1466 #ifdef USE_MPI
1467                 }
1468 #endif
1469
1470                 return 0;
1471
1472
1473         }
1474         catch(exception& e) {
1475                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1476                 exit(1);
1477         }
1478 }
1479 //**********************************************************************************************************************
1480 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1481     try {
1482         rabund->setLabel(list->getLabel());
1483         for(int i = 0; i < list->getNumBins(); i++) {
1484             if (m->control_pressed) { break; }
1485             vector<string> binNames;
1486             string bin = list->get(i);
1487             m->splitAtComma(bin, binNames);
1488             int total = 0;
1489             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1490             rabund->push_back(total);
1491         }
1492         return 0;
1493     }
1494     catch(exception& e) {
1495                 m->errorOut(e, "ClusterCommand", "createRabund");
1496                 exit(1);
1497         }
1498
1499 }
1500 //**********************************************************************************************************************