clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
  20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
  21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
  22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
  23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
  24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
  25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
  26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
  27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
  28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
  29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
  30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
  31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
  32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
  33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
  34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
  35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
  36
  37                 vector<string> myArray;
  38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  39                 return myArray;
  40         }
  41         catch(exception& e) {
  42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  43                 exit(1);
  44         }
  45 }
  46 //**********************************************************************************************************************
  47 string ClusterSplitCommand::getHelpString(){
  48         try {
  49                 string helpString = "";
  50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
  51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  59                 helpString += "The name parameter allows you to enter your name file. \n";
  60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
  62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  70 #ifdef USE_MPI
  71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  72 #endif
  73                 helpString += "The cluster.split command should be in the following format: \n";
  74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  76                 return helpString;
  77         }
  78         catch(exception& e) {
  79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  80                 exit(1);
  81         }
  82 }
  83 //**********************************************************************************************************************
  84 string ClusterSplitCommand::getOutputPattern(string type) {
  85     try {
  86         string pattern = "";
  87
  88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
  89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
  90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
  91         else if (type == "column") {  pattern = "[filename],dist"; }
  92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
  93
  94         return pattern;
  95     }
  96     catch(exception& e) {
  97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
  98         exit(1);
  99     }
 100 }
 101 //**********************************************************************************************************************
 102 ClusterSplitCommand::ClusterSplitCommand(){
 103         try {
 104                 abort = true; calledHelp = true;
 105                 setParameters();
 106                 vector<string> tempOutNames;
 107                 outputTypes["list"] = tempOutNames;
 108                 outputTypes["rabund"] = tempOutNames;
 109                 outputTypes["sabund"] = tempOutNames;
 110                 outputTypes["column"] = tempOutNames;
 111         }
 112         catch(exception& e) {
 113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 114                 exit(1);
 115         }
 116 }
 117 //**********************************************************************************************************************
 118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 120         try{
 121                 abort = false; calledHelp = false;
 122                 format = "";
 123
 124                 //allow user to run help
 125                 if(option == "help") { help(); abort = true; calledHelp = true; }
 126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 127
 128                 else {
 129                         vector<string> myArray = setParameters();
 130
 131                         OptionParser parser(option);
 132                         map<string,string> parameters = parser.getParameters();
 133
 134                         ValidParameters validParameter("cluster.split");
 135
 136                         //check to make sure all parameters are valid for command
 137                         map<string,string>::iterator it;
 138                         for (it = parameters.begin(); it != parameters.end(); it++) {
 139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 140                                         abort = true;
 141                                 }
 142                         }
 143
 144                         //initialize outputTypes
 145                         vector<string> tempOutNames;
 146                         outputTypes["list"] = tempOutNames;
 147                         outputTypes["rabund"] = tempOutNames;
 148                         outputTypes["sabund"] = tempOutNames;
 149                         outputTypes["column"] = tempOutNames;
 150
 151                         //if the user changes the output directory command factory will send this info to us in the output parameter
 152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 153
 154                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 156                         if (inputDir == "not found"){   inputDir = "";          }
 157                         else {
 158                                 string path;
 159                                 it = parameters.find("phylip");
 160                                 //user has given a template file
 161                                 if(it != parameters.end()){
 162                                         path = m->hasPath(it->second);
 163                                         //if the user has not given a path then, add inputdir. else leave path alone.
 164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 165                                 }
 166
 167                                 it = parameters.find("column");
 168                                 //user has given a template file
 169                                 if(it != parameters.end()){
 170                                         path = m->hasPath(it->second);
 171                                         //if the user has not given a path then, add inputdir. else leave path alone.
 172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 173                                 }
 174
 175                                 it = parameters.find("name");
 176                                 //user has given a template file
 177                                 if(it != parameters.end()){
 178                                         path = m->hasPath(it->second);
 179                                         //if the user has not given a path then, add inputdir. else leave path alone.
 180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 181                                 }
 182
 183                                 it = parameters.find("taxonomy");
 184                                 //user has given a template file
 185                                 if(it != parameters.end()){
 186                                         path = m->hasPath(it->second);
 187                                         //if the user has not given a path then, add inputdir. else leave path alone.
 188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 189                                 }
 190
 191                                 it = parameters.find("fasta");
 192                                 //user has given a template file
 193                                 if(it != parameters.end()){
 194                                         path = m->hasPath(it->second);
 195                                         //if the user has not given a path then, add inputdir. else leave path alone.
 196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 197                                 }
 198
 199                 it = parameters.find("count");
 200                                 //user has given a template file
 201                                 if(it != parameters.end()){
 202                                         path = m->hasPath(it->second);
 203                                         //if the user has not given a path then, add inputdir. else leave path alone.
 204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 205                                 }
 206                         }
 207
 208                         //check for required parameters
 209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 210                         if (phylipfile == "not open") { abort = true; }
 211                         else if (phylipfile == "not found") { phylipfile = ""; }
 212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 213
 214                         columnfile = validParameter.validFile(parameters, "column", true);
 215                         if (columnfile == "not open") { abort = true; }
 216                         else if (columnfile == "not found") { columnfile = ""; }
 217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 218
 219                         namefile = validParameter.validFile(parameters, "name", true);
 220                         if (namefile == "not open") { abort = true; namefile = "";}
 221                         else if (namefile == "not found") { namefile = "";  }
 222                         else { m->setNameFile(namefile); }
 223
 224             countfile = validParameter.validFile(parameters, "count", true);
 225                         if (countfile == "not open") { abort = true; countfile = "";}
 226                         else if (countfile == "not found") { countfile = "";  }
 227                         else { m->setCountTableFile(countfile); }
 228
 229                         fastafile = validParameter.validFile(parameters, "fasta", true);
 230                         if (fastafile == "not open") { abort = true; }
 231                         else if (fastafile == "not found") { fastafile = ""; }
 232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 233
 234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 235                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 236                         else if (taxFile == "not found") { taxFile = ""; }
 237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 238
 239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 240                                 //is there are current file available for either of these?
 241                                 //give priority to column, then phylip, then fasta
 242                                 columnfile = m->getColumnFile();
 243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 244                                 else {
 245                                         phylipfile = m->getPhylipFile();
 246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 247                                         else {
 248                                                 fastafile = m->getFastaFile();
 249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 250                                                 else {
 251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 252                                                         abort = true;
 253                                                 }
 254                                         }
 255                                 }
 256                         }
 257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 258
 259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 260
 261                         if (columnfile != "") {
 262                                 if ((namefile == "") && (countfile == "")) {
 263                                         namefile = m->getNameFile();
 264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 265                                         else {
 266                                                 countfile = m->getCountTableFile();
 267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 268                         else {
 269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 270                             abort = true;
 271                         }
 272                                         }
 273                                 }
 274                         }
 275
 276                         if (fastafile != "") {
 277                                 if (taxFile == "") {
 278                                         taxFile = m->getTaxonomyFile();
 279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 280                                         else {
 281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 282                                                 abort = true;
 283                                         }
 284                                 }
 285
 286                                 if ((namefile == "") && (countfile == "")) {
 287                                         namefile = m->getNameFile();
 288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 289                                         else {
 290                                                 countfile = m->getCountTableFile();
 291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 292                         else {
 293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 294                             abort = true;
 295                         }
 296                                         }
 297                                 }
 298                         }
 299
 300                         //check for optional parameter and set defaults
 301                         // ...at some point should added some additional type checking...
 302                         //get user cutoff and precision or use defaults
 303                         string temp;
 304                         temp = validParameter.validFile(parameters, "precision", false);
 305                         if (temp == "not found") { temp = "100"; }
 306                         //saves precision legnth for formatting below
 307                         length = temp.length();
 308                         m->mothurConvert(temp, precision);
 309
 310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 311                         hard = m->isTrue(temp);
 312
 313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 314                         large = m->isTrue(temp);
 315
 316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 317                         m->setProcessors(temp);
 318                         m->mothurConvert(temp, processors);
 319
 320                         temp = validParameter.validFile(parameters, "splitmethod", false);
 321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 322                                 if (temp == "not found")  { splitmethod = "distance"; }
 323                                 else {  splitmethod = temp; }
 324                         }
 325
 326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 327                         classic = m->isTrue(temp);
 328
 329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 330
 331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 332                         m->mothurConvert(temp, cutoff);
 333                         cutoff += (5 / (precision * 10.0));
 334
 335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 336                         m->mothurConvert(temp, taxLevelCutoff);
 337
 338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 339
 340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 342
 343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 345
 346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 347
 348                         showabund = validParameter.validFile(parameters, "showabund", false);
 349                         if (showabund == "not found") { showabund = "T"; }
 350
 351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
 352             runCluster = m->isTrue(temp);
 353
 354                         timing = validParameter.validFile(parameters, "timing", false);
 355                         if (timing == "not found") { timing = "F"; }
 356
 357                 }
 358         }
 359         catch(exception& e) {
 360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 361                 exit(1);
 362         }
 363 }
 364
 365 //**********************************************************************************************************************
 366
 367 int ClusterSplitCommand::execute(){
 368         try {
 369
 370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 371
 372                 time_t estart;
 373                 vector<string> listFileNames;
 374                 set<string> labels;
 375                 string singletonName = "";
 376                 double saveCutoff = cutoff;
 377
 378                 //****************** file prep work ******************************//
 379                 #ifdef USE_MPI
 380                         int pid;
 381                         int tag = 2001;
 382                         MPI_Status status;
 383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 385
 386                         if (pid == 0) { //only process 0 converts and splits
 387
 388                 #endif
 389
 390                 //if user gave a phylip file convert to column file
 391                 if (format == "phylip") {
 392                         estart = time(NULL);
 393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 394
 395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 396
 397                         NameAssignment* nameMap = NULL;
 398                         convert->setFormat("phylip");
 399                         convert->read(nameMap);
 400
 401                         if (m->control_pressed) {  delete convert;  return 0;  }
 402
 403                         distfile = convert->getOutputFile();
 404
 405                         //if no names file given with phylip file, create it
 406                         ListVector* listToMakeNameFile =  convert->getListVector();
 407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 408                                 ofstream out;
 409                                 namefile = phylipfile + ".names";
 410                                 m->openOutputFile(namefile, out);
 411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 412                                         string bin = listToMakeNameFile->get(i);
 413                                         out << bin << '\t' << bin << endl;
 414                                 }
 415                                 out.close();
 416                         }
 417                         delete listToMakeNameFile;
 418                         delete convert;
 419
 420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 421                 }
 422                 if (m->control_pressed) { return 0; }
 423
 424                 estart = time(NULL);
 425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 426
 427                 //split matrix into non-overlapping groups
 428                 SplitMatrix* split;
 429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 433
 434                 split->split();
 435
 436                 if (m->control_pressed) { delete split; return 0; }
 437
 438                 singletonName = split->getSingletonNames();
 439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 440                 delete split;
 441
 442         if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
 443
 444                 //output a merged distance file
 445                 //if (splitmethod == "fasta")           { createMergedDistanceFile(distName); }
 446
 447                 if (m->control_pressed) { return 0; }
 448
 449                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 450                 estart = time(NULL);
 451
 452         if (!runCluster) {
 453
 454                 m->mothurOutEndLine();
 455                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 456                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
 457                 m->mothurOutEndLine();
 458                 return 0;
 459
 460         }
 461
 462                 //****************** break up files between processes and cluster each file set ******************************//
 463         #ifdef USE_MPI
 464                         ////you are process 0 from above////
 465
 466                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 467                         dividedNames.resize(processors);
 468
 469                         //for each file group figure out which process will complete it
 470                         //want to divide the load intelligently so the big files are spread between processes
 471                         for (int i = 0; i < distName.size(); i++) {
 472                                 int processToAssign = (i+1) % processors;
 473                                 if (processToAssign == 0) { processToAssign = processors; }
 474
 475                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 476                         }
 477
 478                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 479                         for (int i = 0; i < processors; i++) {
 480                                 int remainder = ((i+1) % processors);
 481                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 482                         }
 483
 484
 485                         //send each child the list of files it needs to process
 486                         for(int i = 1; i < processors; i++) {
 487                                 //send number of file pairs
 488                                 int num = dividedNames[i].size();
 489                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 490
 491                                 for (int j = 0; j < num; j++) { //send filenames to process i
 492                                         char tempDistFileName[1024];
 493                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 494                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 495
 496                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 497                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 498
 499                                         char tempNameFileName[1024];
 500                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 501                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 502
 503                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 504                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 505                                 }
 506                         }
 507
 508                         //process your share
 509                         listFileNames = cluster(dividedNames[0], labels);
 510
 511                         //receive the other processes info
 512                         for(int i = 1; i < processors; i++) {
 513                                 int num = dividedNames[i].size();
 514
 515                                 double tempCutoff;
 516                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 517                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 518
 519                                 //send list filenames to root process
 520                                 for (int j = 0; j < num; j++) {
 521                                         int lengthList = 0;
 522                                         char tempListFileName[1024];
 523
 524                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 525                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 526
 527                                         string myListFileName = tempListFileName;
 528                                         myListFileName = myListFileName.substr(0, lengthList);
 529
 530                                         listFileNames.push_back(myListFileName);
 531                                 }
 532
 533                                 //send Labels to root process
 534                                 int numLabels = 0;
 535                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 536
 537                                 for (int j = 0; j < numLabels; j++) {
 538                                         int lengthLabel = 0;
 539                                         char tempLabel[100];
 540
 541                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 542                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 543
 544                                         string myLabel = tempLabel;
 545                                         myLabel = myLabel.substr(0, lengthLabel);
 546
 547                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 548                                 }
 549                         }
 550
 551                 }else { //you are a child process
 552                         vector < map<string, string> >  myNames;
 553
 554                         //recieve the files you need to process
 555                         //receive number of file pairs
 556                         int num = 0;
 557                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 558
 559                         myNames.resize(num);
 560
 561                         for (int j = 0; j < num; j++) { //receive filenames to process
 562                                 int lengthDist = 0;
 563                                 char tempDistFileName[1024];
 564
 565                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 566                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 567
 568                                 string myDistFileName = tempDistFileName;
 569                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 570
 571                                 int lengthName = 0;
 572                                 char tempNameFileName[1024];
 573
 574                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 575                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 576
 577                                 string myNameFileName = tempNameFileName;
 578                                 myNameFileName = myNameFileName.substr(0, lengthName);
 579
 580                                 //save file name
 581                                 myNames[j][myDistFileName] = myNameFileName;
 582                         }
 583
 584                         //process them
 585                         listFileNames = cluster(myNames, labels);
 586
 587                         //send cutoff
 588                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 589
 590                         //send list filenames to root process
 591                         for (int j = 0; j < num; j++) {
 592                                 char tempListFileName[1024];
 593                                 strcpy(tempListFileName, listFileNames[j].c_str());
 594                                 int lengthList = listFileNames[j].length();
 595
 596                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 597                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 598                         }
 599
 600                         //send Labels to root process
 601                         int numLabels = labels.size();
 602                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 603
 604                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 605                                 char tempLabel[100];
 606                                 strcpy(tempLabel, (*it).c_str());
 607                                 int lengthLabel = (*it).length();
 608
 609                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 610                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 611                         }
 612                 }
 613
 614                 //make everyone wait
 615                 MPI_Barrier(MPI_COMM_WORLD);
 616
 617         #else
 618                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 619                 //sanity check
 620                 if (processors > distName.size()) { processors = distName.size(); }
 621
 622                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 623                                 if(processors == 1){
 624                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 625                                 }else{
 626                                         listFileNames = createProcesses(distName, labels);
 627                 }
 628                 #else
 629                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 630                 #endif
 631         #endif
 632                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 633
 634                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 635
 636                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 637
 638                 //****************** merge list file and create rabund and sabund files ******************************//
 639                 estart = time(NULL);
 640                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 641
 642                 #ifdef USE_MPI
 643                         if (pid == 0) { //only process 0 merges
 644                 #endif
 645
 646                 ListVector* listSingle;
 647                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 648
 649                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 650
 651                 mergeLists(listFileNames, labelBins, listSingle);
 652
 653                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 654
 655                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 656
 657                 //set list file as new current listfile
 658                 string current = "";
 659                 itTypes = outputTypes.find("list");
 660                 if (itTypes != outputTypes.end()) {
 661                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 662                 }
 663
 664                 //set rabund file as new current rabundfile
 665                 itTypes = outputTypes.find("rabund");
 666                 if (itTypes != outputTypes.end()) {
 667                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 668                 }
 669
 670                 //set sabund file as new current sabundfile
 671                 itTypes = outputTypes.find("sabund");
 672                 if (itTypes != outputTypes.end()) {
 673                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 674                 }
 675
 676                 m->mothurOutEndLine();
 677                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 678                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 679                 m->mothurOutEndLine();
 680
 681                 #ifdef USE_MPI
 682                         } //only process 0 merges
 683
 684                         //make everyone wait
 685                         MPI_Barrier(MPI_COMM_WORLD);
 686                 #endif
 687
 688                 return 0;
 689         }
 690         catch(exception& e) {
 691                 m->errorOut(e, "ClusterSplitCommand", "execute");
 692                 exit(1);
 693         }
 694 }
 695 //**********************************************************************************************************************
 696 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 697         try {
 698
 699                 map<float, int> labelBin;
 700                 vector<float> orderFloat;
 701                 int numSingleBins;
 702
 703                 //read in singletons
 704                 if (singleton != "none") {
 705
 706             ifstream in;
 707             m->openInputFile(singleton, in);
 708
 709                         string firstCol, secondCol;
 710                         listSingle = new ListVector();
 711
 712             if (countfile != "") { m->getline(in); m->gobble(in); }
 713
 714                         while (!in.eof()) {
 715                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 716                                 if (countfile == "") { listSingle->push_back(secondCol); }
 717                 else { listSingle->push_back(firstCol); }
 718                         }
 719
 720                         in.close();
 721                         m->mothurRemove(singleton);
 722
 723                         numSingleBins = listSingle->getNumBins();
 724                 }else{  listSingle = NULL; numSingleBins = 0;  }
 725
 726                 //go through users set and make them floats so we can sort them
 727                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 728                         float temp = -10.0;
 729
 730                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 731                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 732
 733                         if (temp <= cutoff) {
 734                                 orderFloat.push_back(temp);
 735                                 labelBin[temp] = numSingleBins; //initialize numbins
 736                         }
 737                 }
 738
 739                 //sort order
 740                 sort(orderFloat.begin(), orderFloat.end());
 741                 userLabels.clear();
 742
 743                 //get the list info from each file
 744                 for (int k = 0; k < listNames.size(); k++) {
 745
 746                         if (m->control_pressed) {
 747                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 748                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 749                                 return labelBin;
 750                         }
 751
 752                         InputData* input = new InputData(listNames[k], "list");
 753                         ListVector* list = input->getListVector();
 754                         string lastLabel = list->getLabel();
 755
 756                         string filledInList = listNames[k] + "filledInTemp";
 757                         ofstream outFilled;
 758                         m->openOutputFile(filledInList, outFilled);
 759
 760                         //for each label needed
 761                         for(int l = 0; l < orderFloat.size(); l++){
 762
 763                                 string thisLabel;
 764                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 765                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 766
 767                                 //this file has reached the end
 768                                 if (list == NULL) {
 769                                         list = input->getListVector(lastLabel, true);
 770                                 }else{  //do you have the distance, or do you need to fill in
 771
 772                                         float labelFloat;
 773                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 774                                         else { convert(list->getLabel(), labelFloat); }
 775
 776                                         //check for missing labels
 777                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 778                                                 //if its bigger get last label, otherwise keep it
 779                                                 delete list;
 780                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 781                                         }
 782                                         lastLabel = list->getLabel();
 783                                 }
 784
 785                                 //print to new file
 786                                 list->setLabel(thisLabel);
 787                                 list->print(outFilled);
 788
 789                                 //update labelBin
 790                                 labelBin[orderFloat[l]] += list->getNumBins();
 791
 792                                 delete list;
 793
 794                                 list = input->getListVector();
 795                         }
 796
 797                         if (list != NULL) { delete list; }
 798                         delete input;
 799
 800                         outFilled.close();
 801                         m->mothurRemove(listNames[k]);
 802                         rename(filledInList.c_str(), listNames[k].c_str());
 803                 }
 804
 805                 return labelBin;
 806         }
 807         catch(exception& e) {
 808                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 809                 exit(1);
 810         }
 811 }
 812 //**********************************************************************************************************************
 813 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 814         try {
 815                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 816                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 817
 818         map<string, string> variables;
 819         variables["[filename]"] = fileroot;
 820         variables["[clustertag]"] = tag;
 821         string sabundFileName = getOutputFileName("sabund", variables);
 822         string rabundFileName = getOutputFileName("rabund", variables);
 823         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
 824         string listFileName = getOutputFileName("list", variables);
 825
 826         if (countfile == "") {
 827             m->openOutputFile(sabundFileName,   outSabund);
 828             m->openOutputFile(rabundFileName,   outRabund);
 829             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 830             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 831
 832         }
 833                 m->openOutputFile(listFileName, outList);
 834         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 835
 836
 837                 map<float, int>::iterator itLabel;
 838
 839                 //for each label needed
 840                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 841
 842                         string thisLabel;
 843                         if (itLabel->first == -1) { thisLabel = "unique"; }
 844                         else { thisLabel = toString(itLabel->first,  length-1);  }
 845
 846                         outList << thisLabel << '\t' << itLabel->second << '\t';
 847
 848             RAbundVector* rabund = NULL;
 849             if (countfile == "") {
 850                 rabund = new RAbundVector();
 851                 rabund->setLabel(thisLabel);
 852             }
 853
 854                         //add in singletons
 855                         if (listSingle != NULL) {
 856                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 857                                         outList << listSingle->get(j) << '\t';
 858                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 859                                 }
 860                         }
 861
 862                         //get the list info from each file
 863                         for (int k = 0; k < listNames.size(); k++) {
 864
 865                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 866
 867                                 InputData* input = new InputData(listNames[k], "list");
 868                                 ListVector* list = input->getListVector(thisLabel);
 869
 870                                 //this file has reached the end
 871                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 872                                 else {
 873                                         for (int j = 0; j < list->getNumBins(); j++) {
 874                                                 outList << list->get(j) << '\t';
 875                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 876                                         }
 877                                         delete list;
 878                                 }
 879                                 delete input;
 880                         }
 881
 882             if (countfile == "") {
 883                 SAbundVector sabund = rabund->getSAbundVector();
 884                 sabund.print(outSabund);
 885                 rabund->print(outRabund);
 886             }
 887                         outList << endl;
 888
 889                         if (rabund != NULL) { delete rabund; }
 890                 }
 891
 892                 outList.close();
 893         if (countfile == "") {
 894             outRabund.close();
 895             outSabund.close();
 896                 }
 897                 if (listSingle != NULL) { delete listSingle;  }
 898
 899                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 900
 901                 return 0;
 902         }
 903         catch(exception& e) {
 904                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 905                 exit(1);
 906         }
 907 }
 908
 909 //**********************************************************************************************************************
 910
 911 void ClusterSplitCommand::printData(ListVector* oldList){
 912         try {
 913                 string label = oldList->getLabel();
 914                 RAbundVector oldRAbund = oldList->getRAbundVector();
 915
 916                 oldRAbund.setLabel(label);
 917                 if (m->isTrue(showabund)) {
 918                         oldRAbund.getSAbundVector().print(cout);
 919                 }
 920                 oldRAbund.print(outRabund);
 921                 oldRAbund.getSAbundVector().print(outSabund);
 922
 923                 oldList->print(outList);
 924         }
 925         catch(exception& e) {
 926                 m->errorOut(e, "ClusterSplitCommand", "printData");
 927                 exit(1);
 928         }
 929 }
 930 //**********************************************************************************************************************
 931 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 932         try {
 933
 934         vector<string> listFiles;
 935         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 936         dividedNames.resize(processors);
 937
 938         //for each file group figure out which process will complete it
 939         //want to divide the load intelligently so the big files are spread between processes
 940         for (int i = 0; i < distName.size(); i++) {
 941             //cout << i << endl;
 942             int processToAssign = (i+1) % processors;
 943             if (processToAssign == 0) { processToAssign = processors; }
 944
 945             dividedNames[(processToAssign-1)].push_back(distName[i]);
 946             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 947         }
 948
 949         //now lets reverse the order of ever other process, so we balance big files running with little ones
 950         for (int i = 0; i < processors; i++) {
 951             //cout << i << endl;
 952             int remainder = ((i+1) % processors);
 953             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 954         }
 955
 956         if (m->control_pressed) { return listFiles; }
 957
 958         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 959                 int process = 1;
 960                 processIDS.clear();
 961
 962                 //loop through and create all the processes you want
 963                 while (process != processors) {
 964                         int pid = fork();
 965
 966                         if (pid > 0) {
 967                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 968                                 process++;
 969                         }else if (pid == 0){
 970                                 set<string> labels;
 971                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 972
 973                                 //write out names to file
 974                                 string filename = toString(getpid()) + ".temp";
 975                                 ofstream out;
 976                                 m->openOutputFile(filename, out);
 977                                 out << tag << endl;
 978                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 979                                 out.close();
 980
 981                                 //print out labels
 982                                 ofstream outLabels;
 983                                 filename = toString(getpid()) + ".temp.labels";
 984                                 m->openOutputFile(filename, outLabels);
 985
 986                                 outLabels << cutoff << endl;
 987                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 988                                         outLabels << (*it) << endl;
 989                                 }
 990                                 outLabels.close();
 991
 992                                 exit(0);
 993                         }else {
 994                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 995                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 996                                 exit(0);
 997                         }
 998                 }
 999
1000         //do your part
1001         listFiles = cluster(dividedNames[0], labels);
1002
1003                 //force parent to wait until all the processes are done
1004                 for (int i=0;i< processIDS.size();i++) {
1005                         int temp = processIDS[i];
1006                         wait(&temp);
1007                 }
1008
1009         //get list of list file names from each process
1010         for(int i=0;i<processIDS.size();i++){
1011             string filename = toString(processIDS[i]) + ".temp";
1012             ifstream in;
1013             m->openInputFile(filename, in);
1014
1015             in >> tag; m->gobble(in);
1016
1017             while(!in.eof()) {
1018                 string tempName;
1019                 in >> tempName; m->gobble(in);
1020                 listFiles.push_back(tempName);
1021             }
1022             in.close();
1023             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1024
1025             //get labels
1026             filename = toString(processIDS[i]) + ".temp.labels";
1027             ifstream in2;
1028             m->openInputFile(filename, in2);
1029
1030             float tempCutoff;
1031             in2 >> tempCutoff; m->gobble(in2);
1032             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1033
1034             while(!in2.eof()) {
1035                 string tempName;
1036                 in2 >> tempName; m->gobble(in2);
1037                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1038             }
1039             in2.close();
1040             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1041         }
1042
1043
1044     #else
1045
1046         //////////////////////////////////////////////////////////////////////////////////////////////////////
1047                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1048                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1049                 //Taking advantage of shared memory to allow both threads to add labels.
1050                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1051                 /*
1052                 vector<clusterData*> pDataArray;
1053                 DWORD   dwThreadIdArray[processors-1];
1054                 HANDLE  hThreadArray[processors-1];
1055
1056                 //Create processor worker threads.
1057                 for( int i=1; i<processors; i++ ){
1058                         // Allocate memory for thread data.
1059                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1060                         pDataArray.push_back(tempCluster);
1061                         processIDS.push_back(i);
1062
1063                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1064                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1065                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1066
1067                 }
1068
1069         //do your part
1070         listFiles = cluster(dividedNames[0], labels);
1071
1072                 //Wait until all threads have terminated.
1073                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1074
1075                 //Close all thread handles and free memory allocations.
1076                 for(int i=0; i < pDataArray.size(); i++){
1077             //get tag
1078             tag = pDataArray[i]->tag;
1079             //get listfiles created
1080             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1081             //get labels
1082             set<string>::iterator it;
1083             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1084                         //check cutoff
1085             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1086                         CloseHandle(hThreadArray[i]);
1087                         delete pDataArray[i];
1088                 }
1089 */
1090         #endif
1091
1092         return listFiles;
1093
1094         }
1095         catch(exception& e) {
1096                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1097                 exit(1);
1098         }
1099 }
1100 //**********************************************************************************************************************
1101
1102 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1103         try {
1104
1105                 vector<string> listFileNames;
1106                 double smallestCutoff = cutoff;
1107
1108                 //cluster each distance file
1109                 for (int i = 0; i < distNames.size(); i++) {
1110
1111                         string thisNamefile = distNames[i].begin()->second;
1112                         string thisDistFile = distNames[i].begin()->first;
1113
1114                         string listFileName = "";
1115             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1116             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1117
1118                         if (m->control_pressed) { //clean up
1119                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1120                                 listFileNames.clear(); return listFileNames;
1121                         }
1122
1123             listFileNames.push_back(listFileName);
1124         }
1125
1126                 cutoff = smallestCutoff;
1127
1128                 return listFileNames;
1129
1130         }
1131         catch(exception& e) {
1132                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1133                 exit(1);
1134         }
1135
1136
1137 }
1138 //**********************************************************************************************************************
1139 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1140         try {
1141         string listFileName = "";
1142
1143         ListVector* list = NULL;
1144         ListVector oldList;
1145         RAbundVector* rabund = NULL;
1146
1147 #ifdef USE_MPI
1148         int pid;
1149         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1150
1151         //output your files too
1152         if (pid != 0) {
1153             cout << endl << "Reading " << thisDistFile << endl;
1154         }
1155 #endif
1156
1157         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1158
1159         //reads phylip file storing data in 2D vector, also fills list and rabund
1160         bool sim = false;
1161                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1162
1163         NameAssignment* nameMap = NULL;
1164         CountTable* ct = NULL;
1165         if(namefile != ""){
1166                         nameMap = new NameAssignment(thisNamefile);
1167                         nameMap->readMap();
1168             cluster->readPhylipFile(thisDistFile, nameMap);
1169                 }else if (countfile != "") {
1170             ct = new CountTable();
1171             ct->readTable(thisNamefile);
1172             cluster->readPhylipFile(thisDistFile, ct);
1173         }
1174         tag = cluster->getTag();
1175
1176                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1177             else { delete ct; } delete cluster; return 0; }
1178
1179                 list = cluster->getListVector();
1180                 rabund = cluster->getRAbundVector();
1181
1182                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1183                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1184         listFileName = fileroot+ tag + ".list";
1185
1186         ofstream listFile;
1187                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1188
1189                 float previousDist = 0.00000;
1190                 float rndPreviousDist = 0.00000;
1191                 oldList = *list;
1192
1193 #ifdef USE_MPI
1194         //output your files too
1195         if (pid != 0) {
1196             cout << endl << "Clustering " << thisDistFile << endl;
1197         }
1198 #endif
1199
1200         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1201
1202                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1203                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1204                 else { delete ct; } return listFileName;  }
1205
1206                         cluster->update(cutoff);
1207
1208                         float dist = cluster->getSmallDist();
1209                         float rndDist;
1210                         if (hard) {
1211                                 rndDist = m->ceilDist(dist, precision);
1212                         }else{
1213                                 rndDist = m->roundDist(dist, precision);
1214                         }
1215
1216             if(previousDist <= 0.0000 && dist != previousDist){
1217                 oldList.setLabel("unique");
1218                 oldList.print(listFile);
1219                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1220             }
1221             else if(rndDist != rndPreviousDist){
1222                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1223                 oldList.print(listFile);
1224                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1225             }
1226
1227
1228                         previousDist = dist;
1229                         rndPreviousDist = rndDist;
1230                         oldList = *list;
1231                 }
1232
1233                 if(previousDist <= 0.0000){
1234             oldList.setLabel("unique");
1235             oldList.print(listFile);
1236             if (labels.count("unique") == 0) { labels.insert("unique"); }
1237         }
1238         else if(rndPreviousDist<cutoff){
1239             oldList.setLabel(toString(rndPreviousDist,  length-1));
1240             oldList.print(listFile);
1241             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1242         }
1243
1244
1245                 listFile.close();
1246
1247                 delete cluster;  delete list; delete rabund;
1248         if(namefile != ""){     delete nameMap; }
1249         else { delete ct; }
1250
1251         m->mothurRemove(thisDistFile);
1252         m->mothurRemove(thisNamefile);
1253
1254         return listFileName;
1255
1256         }
1257         catch(exception& e) {
1258                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1259                 exit(1);
1260         }
1261 }
1262
1263 //**********************************************************************************************************************
1264 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1265         try {
1266         string listFileName = "";
1267
1268         Cluster* cluster = NULL;
1269         SparseDistanceMatrix* matrix = NULL;
1270         ListVector* list = NULL;
1271         ListVector oldList;
1272         RAbundVector* rabund = NULL;
1273
1274         if (m->control_pressed) { return listFileName; }
1275
1276 #ifdef USE_MPI
1277         int pid;
1278         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1279
1280         //output your files too
1281         if (pid != 0) {
1282             cout << endl << "Reading " << thisDistFile << endl;
1283         }
1284 #endif
1285
1286         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1287
1288         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1289         read->setCutoff(cutoff);
1290
1291         NameAssignment* nameMap = NULL;
1292         CountTable* ct = NULL;
1293                 if(namefile != ""){
1294                         nameMap = new NameAssignment(thisNamefile);
1295                         nameMap->readMap();
1296             read->read(nameMap);
1297                 }else if (countfile != "") {
1298             ct = new CountTable();
1299             ct->readTable(thisNamefile);
1300             read->read(ct);
1301         }else { read->read(nameMap); }
1302
1303                 list = read->getListVector();
1304         oldList = *list;
1305                 matrix = read->getDMatrix();
1306
1307                 if(countfile != "") {
1308             rabund = new RAbundVector();
1309             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1310             delete ct;
1311         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1312
1313         delete read;  read = NULL;
1314         if (namefile != "") { delete nameMap; nameMap = NULL; }
1315
1316
1317 #ifdef USE_MPI
1318         //output your files too
1319         if (pid != 0) {
1320             cout << endl << "Clustering " << thisDistFile << endl;
1321         }
1322 #endif
1323
1324         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1325
1326         //create cluster
1327         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1328         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1329         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1330         tag = cluster->getTag();
1331
1332         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1333         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1334
1335         ofstream listFile;
1336         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1337
1338         listFileName = fileroot+ tag + ".list";
1339
1340         float previousDist = 0.00000;
1341         float rndPreviousDist = 0.00000;
1342
1343         oldList = *list;
1344
1345         print_start = true;
1346         start = time(NULL);
1347         double saveCutoff = cutoff;
1348
1349         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1350
1351             if (m->control_pressed) { //clean up
1352                 delete matrix; delete list;     delete cluster; delete rabund;
1353                 listFile.close();
1354                 m->mothurRemove(listFileName);
1355                 return listFileName;
1356             }
1357
1358             cluster->update(saveCutoff);
1359
1360             float dist = matrix->getSmallDist();
1361             float rndDist;
1362             if (hard) {
1363                 rndDist = m->ceilDist(dist, precision);
1364             }else{
1365                 rndDist = m->roundDist(dist, precision);
1366             }
1367
1368             if(previousDist <= 0.0000 && dist != previousDist){
1369                 oldList.setLabel("unique");
1370                 oldList.print(listFile);
1371                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1372             }
1373             else if(rndDist != rndPreviousDist){
1374                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1375                 oldList.print(listFile);
1376                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1377             }
1378
1379             previousDist = dist;
1380             rndPreviousDist = rndDist;
1381             oldList = *list;
1382         }
1383
1384
1385         if(previousDist <= 0.0000){
1386             oldList.setLabel("unique");
1387             oldList.print(listFile);
1388             if (labels.count("unique") == 0) { labels.insert("unique"); }
1389         }
1390         else if(rndPreviousDist<cutoff){
1391             oldList.setLabel(toString(rndPreviousDist,  length-1));
1392             oldList.print(listFile);
1393             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1394         }
1395
1396         delete matrix; delete list;     delete cluster; delete rabund;
1397         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1398         listFile.close();
1399
1400         if (m->control_pressed) { //clean up
1401             m->mothurRemove(listFileName);
1402             return listFileName;
1403         }
1404
1405         m->mothurRemove(thisDistFile);
1406         m->mothurRemove(thisNamefile);
1407
1408         if (saveCutoff != cutoff) {
1409             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1410             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1411
1412             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1413         }
1414
1415         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1416
1417         return listFileName;
1418
1419         }
1420         catch(exception& e) {
1421                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1422                 exit(1);
1423         }
1424 }
1425 //**********************************************************************************************************************
1426
1427 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1428         try{
1429
1430 #ifdef USE_MPI
1431                 int pid;
1432                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1433
1434                 if (pid != 0) {
1435 #endif
1436
1437                 string thisOutputDir = outputDir;
1438                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1439         map<string, string> variables;
1440         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1441                 string outputFileName = getOutputFileName("column", variables);
1442                 m->mothurRemove(outputFileName);
1443
1444
1445                 for (int i = 0; i < distNames.size(); i++) {
1446                         if (m->control_pressed) {  return 0; }
1447
1448                         string thisDistFile = distNames[i].begin()->first;
1449
1450                         m->appendFiles(thisDistFile, outputFileName);
1451                 }
1452
1453                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1454
1455 #ifdef USE_MPI
1456                 }
1457 #endif
1458
1459                 return 0;
1460
1461
1462         }
1463         catch(exception& e) {
1464                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1465                 exit(1);
1466         }
1467 }
1468 //**********************************************************************************************************************
1469 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1470     try {
1471         rabund->setLabel(list->getLabel());
1472         for(int i = 0; i < list->getNumBins(); i++) {
1473             if (m->control_pressed) { break; }
1474             vector<string> binNames;
1475             string bin = list->get(i);
1476             m->splitAtComma(bin, binNames);
1477             int total = 0;
1478             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1479             rabund->push_back(total);
1480         }
1481         return 0;
1482     }
1483     catch(exception& e) {
1484                 m->errorOut(e, "ClusterCommand", "createRabund");
1485                 exit(1);
1486         }
1487
1488 }
1489 //**********************************************************************************************************************