clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
  20                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
  21                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
  22                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
  23                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
  24                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
  25                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
  26                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
  27                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
  28                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
  29                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
  30                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
  31         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
  32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
  33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
  34
  35                 vector<string> myArray;
  36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  37                 return myArray;
  38         }
  39         catch(exception& e) {
  40                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  41                 exit(1);
  42         }
  43 }
  44 //**********************************************************************************************************************
  45 string ClusterSplitCommand::getHelpString(){
  46         try {
  47                 string helpString = "";
  48                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
  49                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  50                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  51                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  52                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  53                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  55                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  56                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  57                 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
  58                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  59                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  60                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  61                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  62                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  63                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  64                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  65         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  66 #ifdef USE_MPI
  67                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  68 #endif
  69                 helpString += "The cluster.split command should be in the following format: \n";
  70                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  71                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  72                 return helpString;
  73         }
  74         catch(exception& e) {
  75                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  76                 exit(1);
  77         }
  78 }
  79 //**********************************************************************************************************************
  80 ClusterSplitCommand::ClusterSplitCommand(){
  81         try {
  82                 abort = true; calledHelp = true;
  83                 setParameters();
  84                 vector<string> tempOutNames;
  85                 outputTypes["list"] = tempOutNames;
  86                 outputTypes["rabund"] = tempOutNames;
  87                 outputTypes["sabund"] = tempOutNames;
  88                 outputTypes["column"] = tempOutNames;
  89         }
  90         catch(exception& e) {
  91                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
  92                 exit(1);
  93         }
  94 }
  95 //**********************************************************************************************************************
  96 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  97 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  98         try{
  99                 abort = false; calledHelp = false;
 100                 format = "";
 101
 102                 //allow user to run help
 103                 if(option == "help") { help(); abort = true; calledHelp = true; }
 104                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 105
 106                 else {
 107                         vector<string> myArray = setParameters();
 108
 109                         OptionParser parser(option);
 110                         map<string,string> parameters = parser.getParameters();
 111
 112                         ValidParameters validParameter("cluster.split");
 113
 114                         //check to make sure all parameters are valid for command
 115                         map<string,string>::iterator it;
 116                         for (it = parameters.begin(); it != parameters.end(); it++) {
 117                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 118                                         abort = true;
 119                                 }
 120                         }
 121
 122                         //initialize outputTypes
 123                         vector<string> tempOutNames;
 124                         outputTypes["list"] = tempOutNames;
 125                         outputTypes["rabund"] = tempOutNames;
 126                         outputTypes["sabund"] = tempOutNames;
 127                         outputTypes["column"] = tempOutNames;
 128
 129                         //if the user changes the output directory command factory will send this info to us in the output parameter
 130                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 131
 132                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 133                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 134                         if (inputDir == "not found"){   inputDir = "";          }
 135                         else {
 136                                 string path;
 137                                 it = parameters.find("phylip");
 138                                 //user has given a template file
 139                                 if(it != parameters.end()){
 140                                         path = m->hasPath(it->second);
 141                                         //if the user has not given a path then, add inputdir. else leave path alone.
 142                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 143                                 }
 144
 145                                 it = parameters.find("column");
 146                                 //user has given a template file
 147                                 if(it != parameters.end()){
 148                                         path = m->hasPath(it->second);
 149                                         //if the user has not given a path then, add inputdir. else leave path alone.
 150                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 151                                 }
 152
 153                                 it = parameters.find("name");
 154                                 //user has given a template file
 155                                 if(it != parameters.end()){
 156                                         path = m->hasPath(it->second);
 157                                         //if the user has not given a path then, add inputdir. else leave path alone.
 158                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 159                                 }
 160
 161                                 it = parameters.find("taxonomy");
 162                                 //user has given a template file
 163                                 if(it != parameters.end()){
 164                                         path = m->hasPath(it->second);
 165                                         //if the user has not given a path then, add inputdir. else leave path alone.
 166                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 167                                 }
 168
 169                                 it = parameters.find("fasta");
 170                                 //user has given a template file
 171                                 if(it != parameters.end()){
 172                                         path = m->hasPath(it->second);
 173                                         //if the user has not given a path then, add inputdir. else leave path alone.
 174                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 175                                 }
 176                         }
 177
 178                         //check for required parameters
 179                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 180                         if (phylipfile == "not open") { abort = true; }
 181                         else if (phylipfile == "not found") { phylipfile = ""; }
 182                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 183
 184                         columnfile = validParameter.validFile(parameters, "column", true);
 185                         if (columnfile == "not open") { abort = true; }
 186                         else if (columnfile == "not found") { columnfile = ""; }
 187                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 188
 189                         namefile = validParameter.validFile(parameters, "name", true);
 190                         if (namefile == "not open") { abort = true; }
 191                         else if (namefile == "not found") { namefile = "";  }
 192                         else { m->setNameFile(namefile); }
 193
 194                         fastafile = validParameter.validFile(parameters, "fasta", true);
 195                         if (fastafile == "not open") { abort = true; }
 196                         else if (fastafile == "not found") { fastafile = ""; }
 197                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 198
 199                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 200                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 201                         else if (taxFile == "not found") { taxFile = ""; }
 202                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 203
 204                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 205                                 //is there are current file available for either of these?
 206                                 //give priority to column, then phylip, then fasta
 207                                 columnfile = m->getColumnFile();
 208                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 209                                 else {
 210                                         phylipfile = m->getPhylipFile();
 211                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 212                                         else {
 213                                                 fastafile = m->getFastaFile();
 214                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 215                                                 else {
 216                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 217                                                         abort = true;
 218                                                 }
 219                                         }
 220                                 }
 221                         }
 222                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 223
 224                         if (columnfile != "") {
 225                                 if (namefile == "") {
 226                                         namefile = m->getNameFile();
 227                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 228                                         else {
 229                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
 230                                                 abort = true;
 231                                         }
 232                                 }
 233                         }
 234
 235                         if (fastafile != "") {
 236                                 if (taxFile == "") {
 237                                         taxFile = m->getTaxonomyFile();
 238                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 239                                         else {
 240                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 241                                                 abort = true;
 242                                         }
 243                                 }
 244
 245                                 if (namefile == "") {
 246                                         namefile = m->getNameFile();
 247                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 248                                         else {
 249                                                 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 250                                                 abort = true;
 251                                         }
 252                                 }
 253                         }
 254
 255                         //check for optional parameter and set defaults
 256                         // ...at some point should added some additional type checking...
 257                         //get user cutoff and precision or use defaults
 258                         string temp;
 259                         temp = validParameter.validFile(parameters, "precision", false);
 260                         if (temp == "not found") { temp = "100"; }
 261                         //saves precision legnth for formatting below
 262                         length = temp.length();
 263                         m->mothurConvert(temp, precision);
 264
 265                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 266                         hard = m->isTrue(temp);
 267
 268                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 269                         large = m->isTrue(temp);
 270
 271                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 272                         m->setProcessors(temp);
 273                         m->mothurConvert(temp, processors);
 274
 275                         temp = validParameter.validFile(parameters, "splitmethod", false);
 276                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 277                                 if (temp == "not found")  { splitmethod = "distance"; }
 278                                 else {  splitmethod = temp; }
 279                         }
 280
 281             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 282                         classic = m->isTrue(temp);
 283
 284             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 285
 286                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 287                         m->mothurConvert(temp, cutoff);
 288                         cutoff += (5 / (precision * 10.0));
 289
 290                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 291                         m->mothurConvert(temp, taxLevelCutoff);
 292
 293                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 294
 295                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 296                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 297
 298                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 299                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 300
 301                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 302
 303                         showabund = validParameter.validFile(parameters, "showabund", false);
 304                         if (showabund == "not found") { showabund = "T"; }
 305
 306                         timing = validParameter.validFile(parameters, "timing", false);
 307                         if (timing == "not found") { timing = "F"; }
 308
 309                 }
 310         }
 311         catch(exception& e) {
 312                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 313                 exit(1);
 314         }
 315 }
 316
 317 //**********************************************************************************************************************
 318
 319 int ClusterSplitCommand::execute(){
 320         try {
 321
 322                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 323
 324                 time_t estart;
 325                 vector<string> listFileNames;
 326                 set<string> labels;
 327                 string singletonName = "";
 328                 double saveCutoff = cutoff;
 329
 330                 //****************** file prep work ******************************//
 331                 #ifdef USE_MPI
 332                         int pid;
 333                         int tag = 2001;
 334                         MPI_Status status;
 335                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 336                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 337
 338                         if (pid == 0) { //only process 0 converts and splits
 339
 340                 #endif
 341
 342                 //if user gave a phylip file convert to column file
 343                 if (format == "phylip") {
 344                         estart = time(NULL);
 345                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 346
 347                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 348
 349                         NameAssignment* nameMap = NULL;
 350                         convert->setFormat("phylip");
 351                         convert->read(nameMap);
 352
 353                         if (m->control_pressed) {  delete convert;  return 0;  }
 354
 355                         distfile = convert->getOutputFile();
 356
 357                         //if no names file given with phylip file, create it
 358                         ListVector* listToMakeNameFile =  convert->getListVector();
 359                         if (namefile == "") {  //you need to make a namefile for split matrix
 360                                 ofstream out;
 361                                 namefile = phylipfile + ".names";
 362                                 m->openOutputFile(namefile, out);
 363                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 364                                         string bin = listToMakeNameFile->get(i);
 365                                         out << bin << '\t' << bin << endl;
 366                                 }
 367                                 out.close();
 368                         }
 369                         delete listToMakeNameFile;
 370                         delete convert;
 371
 372                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 373                 }
 374                 if (m->control_pressed) { return 0; }
 375
 376                 estart = time(NULL);
 377                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 378
 379                 //split matrix into non-overlapping groups
 380                 SplitMatrix* split;
 381                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
 382                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
 383                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);     }
 384                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 385
 386                 split->split();
 387
 388                 if (m->control_pressed) { delete split; return 0; }
 389
 390                 singletonName = split->getSingletonNames();
 391                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 392                 delete split;
 393
 394                 //output a merged distance file
 395                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
 396
 397
 398                 if (m->control_pressed) { return 0; }
 399
 400                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 401                 estart = time(NULL);
 402
 403                 //****************** break up files between processes and cluster each file set ******************************//
 404         #ifdef USE_MPI
 405                         ////you are process 0 from above////
 406
 407                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 408                         dividedNames.resize(processors);
 409
 410                         //for each file group figure out which process will complete it
 411                         //want to divide the load intelligently so the big files are spread between processes
 412                         for (int i = 0; i < distName.size(); i++) {
 413                                 int processToAssign = (i+1) % processors;
 414                                 if (processToAssign == 0) { processToAssign = processors; }
 415
 416                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 417                         }
 418
 419                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 420                         for (int i = 0; i < processors; i++) {
 421                                 int remainder = ((i+1) % processors);
 422                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 423                         }
 424
 425
 426                         //send each child the list of files it needs to process
 427                         for(int i = 1; i < processors; i++) {
 428                                 //send number of file pairs
 429                                 int num = dividedNames[i].size();
 430                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 431
 432                                 for (int j = 0; j < num; j++) { //send filenames to process i
 433                                         char tempDistFileName[1024];
 434                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 435                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 436
 437                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 438                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 439
 440                                         char tempNameFileName[1024];
 441                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 442                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 443
 444                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 445                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 446                                 }
 447                         }
 448
 449                         //process your share
 450                         listFileNames = cluster(dividedNames[0], labels);
 451
 452                         //receive the other processes info
 453                         for(int i = 1; i < processors; i++) {
 454                                 int num = dividedNames[i].size();
 455
 456                                 double tempCutoff;
 457                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 458                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 459
 460                                 //send list filenames to root process
 461                                 for (int j = 0; j < num; j++) {
 462                                         int lengthList = 0;
 463                                         char tempListFileName[1024];
 464
 465                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 466                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 467
 468                                         string myListFileName = tempListFileName;
 469                                         myListFileName = myListFileName.substr(0, lengthList);
 470
 471                                         listFileNames.push_back(myListFileName);
 472                                 }
 473
 474                                 //send Labels to root process
 475                                 int numLabels = 0;
 476                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 477
 478                                 for (int j = 0; j < numLabels; j++) {
 479                                         int lengthLabel = 0;
 480                                         char tempLabel[100];
 481
 482                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 483                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 484
 485                                         string myLabel = tempLabel;
 486                                         myLabel = myLabel.substr(0, lengthLabel);
 487
 488                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 489                                 }
 490                         }
 491
 492                 }else { //you are a child process
 493                         vector < map<string, string> >  myNames;
 494
 495                         //recieve the files you need to process
 496                         //receive number of file pairs
 497                         int num = 0;
 498                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 499
 500                         myNames.resize(num);
 501
 502                         for (int j = 0; j < num; j++) { //receive filenames to process
 503                                 int lengthDist = 0;
 504                                 char tempDistFileName[1024];
 505
 506                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 507                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 508
 509                                 string myDistFileName = tempDistFileName;
 510                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 511
 512                                 int lengthName = 0;
 513                                 char tempNameFileName[1024];
 514
 515                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 516                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 517
 518                                 string myNameFileName = tempNameFileName;
 519                                 myNameFileName = myNameFileName.substr(0, lengthName);
 520
 521                                 //save file name
 522                                 myNames[j][myDistFileName] = myNameFileName;
 523                         }
 524
 525                         //process them
 526                         listFileNames = cluster(myNames, labels);
 527
 528                         //send cutoff
 529                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 530
 531                         //send list filenames to root process
 532                         for (int j = 0; j < num; j++) {
 533                                 char tempListFileName[1024];
 534                                 strcpy(tempListFileName, listFileNames[j].c_str());
 535                                 int lengthList = listFileNames[j].length();
 536
 537                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 538                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 539                         }
 540
 541                         //send Labels to root process
 542                         int numLabels = labels.size();
 543                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 544
 545                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 546                                 char tempLabel[100];
 547                                 strcpy(tempLabel, (*it).c_str());
 548                                 int lengthLabel = (*it).length();
 549
 550                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 551                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 552                         }
 553                 }
 554
 555                 //make everyone wait
 556                 MPI_Barrier(MPI_COMM_WORLD);
 557
 558         #else
 559                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 560                 //sanity check
 561                 if (processors > distName.size()) { processors = distName.size(); }
 562
 563                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 564                                 if(processors == 1){
 565                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 566                                 }else{
 567                                         listFileNames = createProcesses(distName, labels);
 568                 }
 569                 #else
 570                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 571                 #endif
 572         #endif
 573                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 574
 575                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 576
 577                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 578
 579                 //****************** merge list file and create rabund and sabund files ******************************//
 580                 estart = time(NULL);
 581                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 582
 583                 #ifdef USE_MPI
 584                         if (pid == 0) { //only process 0 merges
 585                 #endif
 586
 587                 ListVector* listSingle;
 588                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 589
 590                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 591
 592                 mergeLists(listFileNames, labelBins, listSingle);
 593
 594                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 595
 596                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 597
 598                 //set list file as new current listfile
 599                 string current = "";
 600                 itTypes = outputTypes.find("list");
 601                 if (itTypes != outputTypes.end()) {
 602                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 603                 }
 604
 605                 //set rabund file as new current rabundfile
 606                 itTypes = outputTypes.find("rabund");
 607                 if (itTypes != outputTypes.end()) {
 608                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 609                 }
 610
 611                 //set sabund file as new current sabundfile
 612                 itTypes = outputTypes.find("sabund");
 613                 if (itTypes != outputTypes.end()) {
 614                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 615                 }
 616
 617                 m->mothurOutEndLine();
 618                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 619                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 620                 m->mothurOutEndLine();
 621
 622                 #ifdef USE_MPI
 623                         } //only process 0 merges
 624
 625                         //make everyone wait
 626                         MPI_Barrier(MPI_COMM_WORLD);
 627                 #endif
 628
 629                 return 0;
 630         }
 631         catch(exception& e) {
 632                 m->errorOut(e, "ClusterSplitCommand", "execute");
 633                 exit(1);
 634         }
 635 }
 636 //**********************************************************************************************************************
 637 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 638         try {
 639
 640                 map<float, int> labelBin;
 641                 vector<float> orderFloat;
 642                 int numSingleBins;
 643
 644                 //read in singletons
 645                 if (singleton != "none") {
 646                         ifstream in;
 647                         m->openInputFile(singleton, in);
 648
 649                         string firstCol, secondCol;
 650                         listSingle = new ListVector();
 651                         while (!in.eof()) {
 652                                 in >> firstCol >> secondCol; m->gobble(in);
 653                                 listSingle->push_back(secondCol);
 654                         }
 655                         in.close();
 656                         m->mothurRemove(singleton);
 657
 658                         numSingleBins = listSingle->getNumBins();
 659                 }else{  listSingle = NULL; numSingleBins = 0;  }
 660
 661                 //go through users set and make them floats so we can sort them
 662                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 663                         float temp = -10.0;
 664
 665                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 666                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 667
 668                         if (temp <= cutoff) {
 669                                 orderFloat.push_back(temp);
 670                                 labelBin[temp] = numSingleBins; //initialize numbins
 671                         }
 672                 }
 673
 674                 //sort order
 675                 sort(orderFloat.begin(), orderFloat.end());
 676                 userLabels.clear();
 677
 678                 //get the list info from each file
 679                 for (int k = 0; k < listNames.size(); k++) {
 680
 681                         if (m->control_pressed) {
 682                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 683                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 684                                 return labelBin;
 685                         }
 686
 687                         InputData* input = new InputData(listNames[k], "list");
 688                         ListVector* list = input->getListVector();
 689                         string lastLabel = list->getLabel();
 690
 691                         string filledInList = listNames[k] + "filledInTemp";
 692                         ofstream outFilled;
 693                         m->openOutputFile(filledInList, outFilled);
 694
 695                         //for each label needed
 696                         for(int l = 0; l < orderFloat.size(); l++){
 697
 698                                 string thisLabel;
 699                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 700                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 701
 702                                 //this file has reached the end
 703                                 if (list == NULL) {
 704                                         list = input->getListVector(lastLabel, true);
 705                                 }else{  //do you have the distance, or do you need to fill in
 706
 707                                         float labelFloat;
 708                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 709                                         else { convert(list->getLabel(), labelFloat); }
 710
 711                                         //check for missing labels
 712                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 713                                                 //if its bigger get last label, otherwise keep it
 714                                                 delete list;
 715                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 716                                         }
 717                                         lastLabel = list->getLabel();
 718                                 }
 719
 720                                 //print to new file
 721                                 list->setLabel(thisLabel);
 722                                 list->print(outFilled);
 723
 724                                 //update labelBin
 725                                 labelBin[orderFloat[l]] += list->getNumBins();
 726
 727                                 delete list;
 728
 729                                 list = input->getListVector();
 730                         }
 731
 732                         if (list != NULL) { delete list; }
 733                         delete input;
 734
 735                         outFilled.close();
 736                         m->mothurRemove(listNames[k]);
 737                         rename(filledInList.c_str(), listNames[k].c_str());
 738                 }
 739
 740                 return labelBin;
 741         }
 742         catch(exception& e) {
 743                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 744                 exit(1);
 745         }
 746 }
 747 //**********************************************************************************************************************
 748 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 749         try {
 750                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 751                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 752
 753                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
 754                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
 755                 m->openOutputFile(fileroot+ tag + ".list",              outList);
 756
 757                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
 758                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
 759                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
 760
 761                 map<float, int>::iterator itLabel;
 762
 763                 //for each label needed
 764                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 765
 766                         string thisLabel;
 767                         if (itLabel->first == -1) { thisLabel = "unique"; }
 768                         else { thisLabel = toString(itLabel->first,  length-1);  }
 769
 770                         outList << thisLabel << '\t' << itLabel->second << '\t';
 771
 772                         RAbundVector* rabund = new RAbundVector();
 773                         rabund->setLabel(thisLabel);
 774
 775                         //add in singletons
 776                         if (listSingle != NULL) {
 777                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 778                                         outList << listSingle->get(j) << '\t';
 779                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
 780                                 }
 781                         }
 782
 783                         //get the list info from each file
 784                         for (int k = 0; k < listNames.size(); k++) {
 785
 786                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
 787
 788                                 InputData* input = new InputData(listNames[k], "list");
 789                                 ListVector* list = input->getListVector(thisLabel);
 790
 791                                 //this file has reached the end
 792                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 793                                 else {
 794                                         for (int j = 0; j < list->getNumBins(); j++) {
 795                                                 outList << list->get(j) << '\t';
 796                                                 rabund->push_back(m->getNumNames(list->get(j)));
 797                                         }
 798                                         delete list;
 799                                 }
 800                                 delete input;
 801                         }
 802
 803                         SAbundVector sabund = rabund->getSAbundVector();
 804
 805                         sabund.print(outSabund);
 806                         rabund->print(outRabund);
 807                         outList << endl;
 808
 809                         delete rabund;
 810                 }
 811
 812                 outList.close();
 813                 outRabund.close();
 814                 outSabund.close();
 815
 816                 if (listSingle != NULL) { delete listSingle;  }
 817
 818                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 819
 820                 return 0;
 821         }
 822         catch(exception& e) {
 823                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 824                 exit(1);
 825         }
 826 }
 827
 828 //**********************************************************************************************************************
 829
 830 void ClusterSplitCommand::printData(ListVector* oldList){
 831         try {
 832                 string label = oldList->getLabel();
 833                 RAbundVector oldRAbund = oldList->getRAbundVector();
 834
 835                 oldRAbund.setLabel(label);
 836                 if (m->isTrue(showabund)) {
 837                         oldRAbund.getSAbundVector().print(cout);
 838                 }
 839                 oldRAbund.print(outRabund);
 840                 oldRAbund.getSAbundVector().print(outSabund);
 841
 842                 oldList->print(outList);
 843         }
 844         catch(exception& e) {
 845                 m->errorOut(e, "ClusterSplitCommand", "printData");
 846                 exit(1);
 847         }
 848 }
 849 //**********************************************************************************************************************
 850 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 851         try {
 852
 853         vector<string> listFiles;
 854         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 855         dividedNames.resize(processors);
 856
 857         //for each file group figure out which process will complete it
 858         //want to divide the load intelligently so the big files are spread between processes
 859         for (int i = 0; i < distName.size(); i++) {
 860             //cout << i << endl;
 861             int processToAssign = (i+1) % processors;
 862             if (processToAssign == 0) { processToAssign = processors; }
 863
 864             dividedNames[(processToAssign-1)].push_back(distName[i]);
 865             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 866         }
 867
 868         //not lets reverse the order of ever other process, so we balance big files running with little ones
 869         for (int i = 0; i < processors; i++) {
 870             //cout << i << endl;
 871             int remainder = ((i+1) % processors);
 872             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 873         }
 874
 875         if (m->control_pressed) { return listFiles; }
 876
 877         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 878                 int process = 1;
 879                 processIDS.clear();
 880
 881                 //loop through and create all the processes you want
 882                 while (process != processors) {
 883                         int pid = fork();
 884
 885                         if (pid > 0) {
 886                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 887                                 process++;
 888                         }else if (pid == 0){
 889                                 set<string> labels;
 890                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 891
 892                                 //write out names to file
 893                                 string filename = toString(getpid()) + ".temp";
 894                                 ofstream out;
 895                                 m->openOutputFile(filename, out);
 896                                 out << tag << endl;
 897                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 898                                 out.close();
 899
 900                                 //print out labels
 901                                 ofstream outLabels;
 902                                 filename = toString(getpid()) + ".temp.labels";
 903                                 m->openOutputFile(filename, outLabels);
 904
 905                                 outLabels << cutoff << endl;
 906                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 907                                         outLabels << (*it) << endl;
 908                                 }
 909                                 outLabels.close();
 910
 911                                 exit(0);
 912                         }else {
 913                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 914                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 915                                 exit(0);
 916                         }
 917                 }
 918
 919         //do your part
 920         listFiles = cluster(dividedNames[0], labels);
 921
 922                 //force parent to wait until all the processes are done
 923                 for (int i=0;i< processIDS.size();i++) {
 924                         int temp = processIDS[i];
 925                         wait(&temp);
 926                 }
 927
 928         //get list of list file names from each process
 929         for(int i=0;i<processIDS.size();i++){
 930             string filename = toString(processIDS[i]) + ".temp";
 931             ifstream in;
 932             m->openInputFile(filename, in);
 933
 934             in >> tag; m->gobble(in);
 935
 936             while(!in.eof()) {
 937                 string tempName;
 938                 in >> tempName; m->gobble(in);
 939                 listFiles.push_back(tempName);
 940             }
 941             in.close();
 942             m->mothurRemove((toString(processIDS[i]) + ".temp"));
 943
 944             //get labels
 945             filename = toString(processIDS[i]) + ".temp.labels";
 946             ifstream in2;
 947             m->openInputFile(filename, in2);
 948
 949             float tempCutoff;
 950             in2 >> tempCutoff; m->gobble(in2);
 951             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 952
 953             while(!in2.eof()) {
 954                 string tempName;
 955                 in2 >> tempName; m->gobble(in2);
 956                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
 957             }
 958             in2.close();
 959             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
 960         }
 961
 962
 963     #else
 964
 965         //////////////////////////////////////////////////////////////////////////////////////////////////////
 966                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
 967                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
 968                 //Taking advantage of shared memory to allow both threads to add labels.
 969                 //////////////////////////////////////////////////////////////////////////////////////////////////////
 970
 971                 vector<clusterData*> pDataArray;
 972                 DWORD   dwThreadIdArray[processors-1];
 973                 HANDLE  hThreadArray[processors-1];
 974
 975                 //Create processor worker threads.
 976                 for( int i=1; i<processors; i++ ){
 977                         // Allocate memory for thread data.
 978                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
 979                         pDataArray.push_back(tempCluster);
 980                         processIDS.push_back(i);
 981
 982                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
 983                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
 984                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
 985
 986                 }
 987
 988         //do your part
 989         listFiles = cluster(dividedNames[0], labels);
 990
 991                 //Wait until all threads have terminated.
 992                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
 993
 994                 //Close all thread handles and free memory allocations.
 995                 for(int i=0; i < pDataArray.size(); i++){
 996             //get tag
 997             tag = pDataArray[i]->tag;
 998             //get listfiles created
 999             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1000             //get labels
1001             set<string>::iterator it;
1002             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1003                         //check cutoff
1004             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1005                         CloseHandle(hThreadArray[i]);
1006                         delete pDataArray[i];
1007                 }
1008
1009         #endif
1010
1011         return listFiles;
1012
1013         }
1014         catch(exception& e) {
1015                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1016                 exit(1);
1017         }
1018 }
1019 //**********************************************************************************************************************
1020
1021 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1022         try {
1023
1024                 vector<string> listFileNames;
1025                 double smallestCutoff = cutoff;
1026
1027                 //cluster each distance file
1028                 for (int i = 0; i < distNames.size(); i++) {
1029
1030                         string thisNamefile = distNames[i].begin()->second;
1031                         string thisDistFile = distNames[i].begin()->first;
1032
1033                         string listFileName = "";
1034             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1035             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1036
1037                         if (m->control_pressed) { //clean up
1038                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1039                                 listFileNames.clear(); return listFileNames;
1040                         }
1041
1042             listFileNames.push_back(listFileName);
1043         }
1044
1045                 cutoff = smallestCutoff;
1046
1047                 return listFileNames;
1048
1049         }
1050         catch(exception& e) {
1051                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1052                 exit(1);
1053         }
1054
1055
1056 }
1057 //**********************************************************************************************************************
1058 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1059         try {
1060         string listFileName = "";
1061
1062         ListVector* list = NULL;
1063         ListVector oldList;
1064         RAbundVector* rabund = NULL;
1065
1066 #ifdef USE_MPI
1067         int pid;
1068         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1069
1070         //output your files too
1071         if (pid != 0) {
1072             cout << endl << "Reading " << thisDistFile << endl;
1073         }
1074 #endif
1075
1076         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1077
1078         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1079         nameMap->readMap();
1080
1081                 //reads phylip file storing data in 2D vector, also fills list and rabund
1082         bool sim = false;
1083                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1084                 cluster->readPhylipFile(thisDistFile, nameMap);
1085                 tag = cluster->getTag();
1086
1087                 if (m->control_pressed) { delete cluster; return 0; }
1088
1089                 list = cluster->getListVector();
1090                 rabund = cluster->getRAbundVector();
1091
1092                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1093                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1094         listFileName = fileroot+ tag + ".list";
1095
1096         ofstream listFile;
1097                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1098
1099                 float previousDist = 0.00000;
1100                 float rndPreviousDist = 0.00000;
1101                 oldList = *list;
1102
1103 #ifdef USE_MPI
1104         //output your files too
1105         if (pid != 0) {
1106             cout << endl << "Clustering " << thisDistFile << endl;
1107         }
1108 #endif
1109
1110         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1111
1112                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1113                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  return listFileName;  }
1114
1115                         cluster->update(cutoff);
1116
1117                         float dist = cluster->getSmallDist();
1118                         float rndDist;
1119                         if (hard) {
1120                                 rndDist = m->ceilDist(dist, precision);
1121                         }else{
1122                                 rndDist = m->roundDist(dist, precision);
1123                         }
1124
1125             if(previousDist <= 0.0000 && dist != previousDist){
1126                 oldList.setLabel("unique");
1127                 oldList.print(listFile);
1128                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1129             }
1130             else if(rndDist != rndPreviousDist){
1131                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1132                 oldList.print(listFile);
1133                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1134             }
1135
1136
1137                         previousDist = dist;
1138                         rndPreviousDist = rndDist;
1139                         oldList = *list;
1140                 }
1141
1142                 if(previousDist <= 0.0000){
1143             oldList.setLabel("unique");
1144             oldList.print(listFile);
1145             if (labels.count("unique") == 0) { labels.insert("unique"); }
1146         }
1147         else if(rndPreviousDist<cutoff){
1148             oldList.setLabel(toString(rndPreviousDist,  length-1));
1149             oldList.print(listFile);
1150             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1151         }
1152
1153
1154                 listFile.close();
1155
1156                 delete cluster; delete nameMap; delete list; delete rabund;
1157
1158
1159         return listFileName;
1160
1161         }
1162         catch(exception& e) {
1163                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1164                 exit(1);
1165         }
1166 }
1167
1168 //**********************************************************************************************************************
1169 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1170         try {
1171         string listFileName = "";
1172
1173         Cluster* cluster = NULL;
1174         SparseMatrix* matrix = NULL;
1175         ListVector* list = NULL;
1176         ListVector oldList;
1177         RAbundVector* rabund = NULL;
1178
1179         if (m->control_pressed) { return listFileName; }
1180
1181 #ifdef USE_MPI
1182         int pid;
1183         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1184
1185         //output your files too
1186         if (pid != 0) {
1187             cout << endl << "Reading " << thisDistFile << endl;
1188         }
1189 #endif
1190
1191         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1192
1193         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1194         read->setCutoff(cutoff);
1195
1196         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1197         nameMap->readMap();
1198         read->read(nameMap);
1199
1200         if (m->control_pressed) {  delete read; delete nameMap; return listFileName; }
1201
1202         list = read->getListVector();
1203         oldList = *list;
1204         matrix = read->getMatrix();
1205
1206         delete read;  read = NULL;
1207         delete nameMap; nameMap = NULL;
1208
1209
1210 #ifdef USE_MPI
1211         //output your files too
1212         if (pid != 0) {
1213             cout << endl << "Clustering " << thisDistFile << endl;
1214         }
1215 #endif
1216
1217         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1218
1219         rabund = new RAbundVector(list->getRAbundVector());
1220
1221         //create cluster
1222         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1223         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1224         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1225         tag = cluster->getTag();
1226
1227         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1228         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1229
1230         ofstream listFile;
1231         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1232
1233         listFileName = fileroot+ tag + ".list";
1234
1235         float previousDist = 0.00000;
1236         float rndPreviousDist = 0.00000;
1237
1238         oldList = *list;
1239
1240         print_start = true;
1241         start = time(NULL);
1242         double saveCutoff = cutoff;
1243
1244         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1245
1246             if (m->control_pressed) { //clean up
1247                 delete matrix; delete list;     delete cluster; delete rabund;
1248                 listFile.close();
1249                 m->mothurRemove(listFileName);
1250                 return listFileName;
1251             }
1252
1253             cluster->update(saveCutoff);
1254
1255             float dist = matrix->getSmallDist();
1256             float rndDist;
1257             if (hard) {
1258                 rndDist = m->ceilDist(dist, precision);
1259             }else{
1260                 rndDist = m->roundDist(dist, precision);
1261             }
1262
1263             if(previousDist <= 0.0000 && dist != previousDist){
1264                 oldList.setLabel("unique");
1265                 oldList.print(listFile);
1266                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1267             }
1268             else if(rndDist != rndPreviousDist){
1269                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1270                 oldList.print(listFile);
1271                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1272             }
1273
1274             previousDist = dist;
1275             rndPreviousDist = rndDist;
1276             oldList = *list;
1277         }
1278
1279
1280         if(previousDist <= 0.0000){
1281             oldList.setLabel("unique");
1282             oldList.print(listFile);
1283             if (labels.count("unique") == 0) { labels.insert("unique"); }
1284         }
1285         else if(rndPreviousDist<cutoff){
1286             oldList.setLabel(toString(rndPreviousDist,  length-1));
1287             oldList.print(listFile);
1288             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1289         }
1290
1291         delete matrix; delete list;     delete cluster; delete rabund;
1292         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1293         listFile.close();
1294
1295         if (m->control_pressed) { //clean up
1296             m->mothurRemove(listFileName);
1297             return listFileName;
1298         }
1299
1300         m->mothurRemove(thisDistFile);
1301         m->mothurRemove(thisNamefile);
1302
1303         if (saveCutoff != cutoff) {
1304             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1305             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1306
1307             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1308         }
1309
1310         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1311
1312         return listFileName;
1313
1314         }
1315         catch(exception& e) {
1316                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1317                 exit(1);
1318         }
1319 }
1320 //**********************************************************************************************************************
1321
1322 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1323         try{
1324
1325 #ifdef USE_MPI
1326                 int pid;
1327                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1328
1329                 if (pid != 0) {
1330 #endif
1331
1332                 string thisOutputDir = outputDir;
1333                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1334                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1335                 m->mothurRemove(outputFileName);
1336
1337
1338                 for (int i = 0; i < distNames.size(); i++) {
1339                         if (m->control_pressed) {  return 0; }
1340
1341                         string thisDistFile = distNames[i].begin()->first;
1342
1343                         m->appendFiles(thisDistFile, outputFileName);
1344                 }
1345
1346                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1347
1348 #ifdef USE_MPI
1349                 }
1350 #endif
1351
1352                 return 0;
1353
1354
1355         }
1356         catch(exception& e) {
1357                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1358                 exit(1);
1359         }
1360 }
1361 //**********************************************************************************************************************