clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
  20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
  21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
  22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
  23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
  24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
  25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
  26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcluster);
  27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
  28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
  29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
  30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
  31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
  32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
  33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
  34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
  35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
  36
  37                 vector<string> myArray;
  38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  39                 return myArray;
  40         }
  41         catch(exception& e) {
  42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  43                 exit(1);
  44         }
  45 }
  46 //**********************************************************************************************************************
  47 string ClusterSplitCommand::getHelpString(){
  48         try {
  49                 string helpString = "";
  50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
  51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  59                 helpString += "The name parameter allows you to enter your name file. \n";
  60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
  62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  70 #ifdef USE_MPI
  71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  72 #endif
  73                 helpString += "The cluster.split command should be in the following format: \n";
  74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  76                 return helpString;
  77         }
  78         catch(exception& e) {
  79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  80                 exit(1);
  81         }
  82 }
  83 //**********************************************************************************************************************
  84 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){
  85         try {
  86         string outputFileName = "";
  87                 map<string, vector<string> >::iterator it;
  88
  89         //is this a type this command creates
  90         it = outputTypes.find(type);
  91         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
  92         else {
  93             if (type == "list") {  outputFileName =  "list"; }
  94             else if (type == "rabund") {  outputFileName =  "rabund"; }
  95             else if (type == "sabund") {  outputFileName =  "sabund"; }
  96             else if (type == "column") {  outputFileName =  "dist"; }
  97             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
  98         }
  99         return outputFileName;
 100         }
 101         catch(exception& e) {
 102                 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
 103                 exit(1);
 104         }
 105 }
 106 //**********************************************************************************************************************
 107 ClusterSplitCommand::ClusterSplitCommand(){
 108         try {
 109                 abort = true; calledHelp = true;
 110                 setParameters();
 111                 vector<string> tempOutNames;
 112                 outputTypes["list"] = tempOutNames;
 113                 outputTypes["rabund"] = tempOutNames;
 114                 outputTypes["sabund"] = tempOutNames;
 115                 outputTypes["column"] = tempOutNames;
 116         }
 117         catch(exception& e) {
 118                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 119                 exit(1);
 120         }
 121 }
 122 //**********************************************************************************************************************
 123 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 124 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 125         try{
 126                 abort = false; calledHelp = false;
 127                 format = "";
 128
 129                 //allow user to run help
 130                 if(option == "help") { help(); abort = true; calledHelp = true; }
 131                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 132
 133                 else {
 134                         vector<string> myArray = setParameters();
 135
 136                         OptionParser parser(option);
 137                         map<string,string> parameters = parser.getParameters();
 138
 139                         ValidParameters validParameter("cluster.split");
 140
 141                         //check to make sure all parameters are valid for command
 142                         map<string,string>::iterator it;
 143                         for (it = parameters.begin(); it != parameters.end(); it++) {
 144                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 145                                         abort = true;
 146                                 }
 147                         }
 148
 149                         //initialize outputTypes
 150                         vector<string> tempOutNames;
 151                         outputTypes["list"] = tempOutNames;
 152                         outputTypes["rabund"] = tempOutNames;
 153                         outputTypes["sabund"] = tempOutNames;
 154                         outputTypes["column"] = tempOutNames;
 155
 156                         //if the user changes the output directory command factory will send this info to us in the output parameter
 157                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 158
 159                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 160                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 161                         if (inputDir == "not found"){   inputDir = "";          }
 162                         else {
 163                                 string path;
 164                                 it = parameters.find("phylip");
 165                                 //user has given a template file
 166                                 if(it != parameters.end()){
 167                                         path = m->hasPath(it->second);
 168                                         //if the user has not given a path then, add inputdir. else leave path alone.
 169                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 170                                 }
 171
 172                                 it = parameters.find("column");
 173                                 //user has given a template file
 174                                 if(it != parameters.end()){
 175                                         path = m->hasPath(it->second);
 176                                         //if the user has not given a path then, add inputdir. else leave path alone.
 177                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 178                                 }
 179
 180                                 it = parameters.find("name");
 181                                 //user has given a template file
 182                                 if(it != parameters.end()){
 183                                         path = m->hasPath(it->second);
 184                                         //if the user has not given a path then, add inputdir. else leave path alone.
 185                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 186                                 }
 187
 188                                 it = parameters.find("taxonomy");
 189                                 //user has given a template file
 190                                 if(it != parameters.end()){
 191                                         path = m->hasPath(it->second);
 192                                         //if the user has not given a path then, add inputdir. else leave path alone.
 193                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 194                                 }
 195
 196                                 it = parameters.find("fasta");
 197                                 //user has given a template file
 198                                 if(it != parameters.end()){
 199                                         path = m->hasPath(it->second);
 200                                         //if the user has not given a path then, add inputdir. else leave path alone.
 201                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 202                                 }
 203
 204                 it = parameters.find("count");
 205                                 //user has given a template file
 206                                 if(it != parameters.end()){
 207                                         path = m->hasPath(it->second);
 208                                         //if the user has not given a path then, add inputdir. else leave path alone.
 209                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 210                                 }
 211                         }
 212
 213                         //check for required parameters
 214                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 215                         if (phylipfile == "not open") { abort = true; }
 216                         else if (phylipfile == "not found") { phylipfile = ""; }
 217                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 218
 219                         columnfile = validParameter.validFile(parameters, "column", true);
 220                         if (columnfile == "not open") { abort = true; }
 221                         else if (columnfile == "not found") { columnfile = ""; }
 222                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 223
 224                         namefile = validParameter.validFile(parameters, "name", true);
 225                         if (namefile == "not open") { abort = true; namefile = "";}
 226                         else if (namefile == "not found") { namefile = "";  }
 227                         else { m->setNameFile(namefile); }
 228
 229             countfile = validParameter.validFile(parameters, "count", true);
 230                         if (countfile == "not open") { abort = true; countfile = "";}
 231                         else if (countfile == "not found") { countfile = "";  }
 232                         else { m->setCountTableFile(countfile); }
 233
 234                         fastafile = validParameter.validFile(parameters, "fasta", true);
 235                         if (fastafile == "not open") { abort = true; }
 236                         else if (fastafile == "not found") { fastafile = ""; }
 237                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 238
 239                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 240                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 241                         else if (taxFile == "not found") { taxFile = ""; }
 242                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 243
 244                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 245                                 //is there are current file available for either of these?
 246                                 //give priority to column, then phylip, then fasta
 247                                 columnfile = m->getColumnFile();
 248                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 249                                 else {
 250                                         phylipfile = m->getPhylipFile();
 251                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 252                                         else {
 253                                                 fastafile = m->getFastaFile();
 254                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 255                                                 else {
 256                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 257                                                         abort = true;
 258                                                 }
 259                                         }
 260                                 }
 261                         }
 262                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 263
 264             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 265
 266                         if (columnfile != "") {
 267                                 if ((namefile == "") && (countfile == "")) {
 268                                         namefile = m->getNameFile();
 269                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 270                                         else {
 271                                                 countfile = m->getCountTableFile();
 272                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 273                         else {
 274                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 275                             abort = true;
 276                         }
 277                                         }
 278                                 }
 279                         }
 280
 281                         if (fastafile != "") {
 282                                 if (taxFile == "") {
 283                                         taxFile = m->getTaxonomyFile();
 284                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 285                                         else {
 286                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 287                                                 abort = true;
 288                                         }
 289                                 }
 290
 291                                 if ((namefile == "") && (countfile == "")) {
 292                                         namefile = m->getNameFile();
 293                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 294                                         else {
 295                                                 countfile = m->getCountTableFile();
 296                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 297                         else {
 298                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 299                             abort = true;
 300                         }
 301                                         }
 302                                 }
 303                         }
 304
 305                         //check for optional parameter and set defaults
 306                         // ...at some point should added some additional type checking...
 307                         //get user cutoff and precision or use defaults
 308                         string temp;
 309                         temp = validParameter.validFile(parameters, "precision", false);
 310                         if (temp == "not found") { temp = "100"; }
 311                         //saves precision legnth for formatting below
 312                         length = temp.length();
 313                         m->mothurConvert(temp, precision);
 314
 315                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 316                         hard = m->isTrue(temp);
 317
 318                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 319                         large = m->isTrue(temp);
 320
 321                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 322                         m->setProcessors(temp);
 323                         m->mothurConvert(temp, processors);
 324
 325                         temp = validParameter.validFile(parameters, "splitmethod", false);
 326                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 327                                 if (temp == "not found")  { splitmethod = "distance"; }
 328                                 else {  splitmethod = temp; }
 329                         }
 330
 331             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 332                         classic = m->isTrue(temp);
 333
 334             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 335
 336                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 337                         m->mothurConvert(temp, cutoff);
 338                         cutoff += (5 / (precision * 10.0));
 339
 340                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 341                         m->mothurConvert(temp, taxLevelCutoff);
 342
 343                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 344
 345                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 346                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 347
 348                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 349                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 350
 351                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 352
 353                         showabund = validParameter.validFile(parameters, "showabund", false);
 354                         if (showabund == "not found") { showabund = "T"; }
 355
 356             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
 357             runCluster = m->isTrue(temp);
 358
 359                         timing = validParameter.validFile(parameters, "timing", false);
 360                         if (timing == "not found") { timing = "F"; }
 361
 362                 }
 363         }
 364         catch(exception& e) {
 365                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 366                 exit(1);
 367         }
 368 }
 369
 370 //**********************************************************************************************************************
 371
 372 int ClusterSplitCommand::execute(){
 373         try {
 374
 375                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 376
 377                 time_t estart;
 378                 vector<string> listFileNames;
 379                 set<string> labels;
 380                 string singletonName = "";
 381                 double saveCutoff = cutoff;
 382
 383                 //****************** file prep work ******************************//
 384                 #ifdef USE_MPI
 385                         int pid;
 386                         int tag = 2001;
 387                         MPI_Status status;
 388                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 389                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 390
 391                         if (pid == 0) { //only process 0 converts and splits
 392
 393                 #endif
 394
 395                 //if user gave a phylip file convert to column file
 396                 if (format == "phylip") {
 397                         estart = time(NULL);
 398                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 399
 400                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 401
 402                         NameAssignment* nameMap = NULL;
 403                         convert->setFormat("phylip");
 404                         convert->read(nameMap);
 405
 406                         if (m->control_pressed) {  delete convert;  return 0;  }
 407
 408                         distfile = convert->getOutputFile();
 409
 410                         //if no names file given with phylip file, create it
 411                         ListVector* listToMakeNameFile =  convert->getListVector();
 412                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 413                                 ofstream out;
 414                                 namefile = phylipfile + ".names";
 415                                 m->openOutputFile(namefile, out);
 416                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 417                                         string bin = listToMakeNameFile->get(i);
 418                                         out << bin << '\t' << bin << endl;
 419                                 }
 420                                 out.close();
 421                         }
 422                         delete listToMakeNameFile;
 423                         delete convert;
 424
 425                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 426                 }
 427                 if (m->control_pressed) { return 0; }
 428
 429                 estart = time(NULL);
 430                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 431
 432                 //split matrix into non-overlapping groups
 433                 SplitMatrix* split;
 434                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 435                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 436                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 437                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 438
 439                 split->split();
 440
 441                 if (m->control_pressed) { delete split; return 0; }
 442
 443                 singletonName = split->getSingletonNames();
 444                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 445                 delete split;
 446
 447                 //output a merged distance file
 448                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
 449
 450
 451                 if (m->control_pressed) { return 0; }
 452
 453                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 454                 estart = time(NULL);
 455
 456         if (!runCluster) {
 457 #ifdef USE_MPI
 458     }
 459 #endif
 460                 m->mothurOutEndLine();
 461                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 462                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
 463                 m->mothurOutEndLine();
 464                 return 0;
 465
 466         }
 467
 468                 //****************** break up files between processes and cluster each file set ******************************//
 469         #ifdef USE_MPI
 470                         ////you are process 0 from above////
 471
 472                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 473                         dividedNames.resize(processors);
 474
 475                         //for each file group figure out which process will complete it
 476                         //want to divide the load intelligently so the big files are spread between processes
 477                         for (int i = 0; i < distName.size(); i++) {
 478                                 int processToAssign = (i+1) % processors;
 479                                 if (processToAssign == 0) { processToAssign = processors; }
 480
 481                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 482                         }
 483
 484                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 485                         for (int i = 0; i < processors; i++) {
 486                                 int remainder = ((i+1) % processors);
 487                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 488                         }
 489
 490
 491                         //send each child the list of files it needs to process
 492                         for(int i = 1; i < processors; i++) {
 493                                 //send number of file pairs
 494                                 int num = dividedNames[i].size();
 495                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 496
 497                                 for (int j = 0; j < num; j++) { //send filenames to process i
 498                                         char tempDistFileName[1024];
 499                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 500                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 501
 502                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 503                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 504
 505                                         char tempNameFileName[1024];
 506                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 507                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 508
 509                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 510                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 511                                 }
 512                         }
 513
 514                         //process your share
 515                         listFileNames = cluster(dividedNames[0], labels);
 516
 517                         //receive the other processes info
 518                         for(int i = 1; i < processors; i++) {
 519                                 int num = dividedNames[i].size();
 520
 521                                 double tempCutoff;
 522                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 523                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 524
 525                                 //send list filenames to root process
 526                                 for (int j = 0; j < num; j++) {
 527                                         int lengthList = 0;
 528                                         char tempListFileName[1024];
 529
 530                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 531                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 532
 533                                         string myListFileName = tempListFileName;
 534                                         myListFileName = myListFileName.substr(0, lengthList);
 535
 536                                         listFileNames.push_back(myListFileName);
 537                                 }
 538
 539                                 //send Labels to root process
 540                                 int numLabels = 0;
 541                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 542
 543                                 for (int j = 0; j < numLabels; j++) {
 544                                         int lengthLabel = 0;
 545                                         char tempLabel[100];
 546
 547                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 548                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 549
 550                                         string myLabel = tempLabel;
 551                                         myLabel = myLabel.substr(0, lengthLabel);
 552
 553                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 554                                 }
 555                         }
 556
 557                 }else { //you are a child process
 558                         vector < map<string, string> >  myNames;
 559
 560                         //recieve the files you need to process
 561                         //receive number of file pairs
 562                         int num = 0;
 563                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 564
 565                         myNames.resize(num);
 566
 567                         for (int j = 0; j < num; j++) { //receive filenames to process
 568                                 int lengthDist = 0;
 569                                 char tempDistFileName[1024];
 570
 571                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 572                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 573
 574                                 string myDistFileName = tempDistFileName;
 575                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 576
 577                                 int lengthName = 0;
 578                                 char tempNameFileName[1024];
 579
 580                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 581                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 582
 583                                 string myNameFileName = tempNameFileName;
 584                                 myNameFileName = myNameFileName.substr(0, lengthName);
 585
 586                                 //save file name
 587                                 myNames[j][myDistFileName] = myNameFileName;
 588                         }
 589
 590                         //process them
 591                         listFileNames = cluster(myNames, labels);
 592
 593                         //send cutoff
 594                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 595
 596                         //send list filenames to root process
 597                         for (int j = 0; j < num; j++) {
 598                                 char tempListFileName[1024];
 599                                 strcpy(tempListFileName, listFileNames[j].c_str());
 600                                 int lengthList = listFileNames[j].length();
 601
 602                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 603                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 604                         }
 605
 606                         //send Labels to root process
 607                         int numLabels = labels.size();
 608                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 609
 610                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 611                                 char tempLabel[100];
 612                                 strcpy(tempLabel, (*it).c_str());
 613                                 int lengthLabel = (*it).length();
 614
 615                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 616                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 617                         }
 618                 }
 619
 620                 //make everyone wait
 621                 MPI_Barrier(MPI_COMM_WORLD);
 622
 623         #else
 624                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 625                 //sanity check
 626                 if (processors > distName.size()) { processors = distName.size(); }
 627
 628                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 629                                 if(processors == 1){
 630                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 631                                 }else{
 632                                         listFileNames = createProcesses(distName, labels);
 633                 }
 634                 #else
 635                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 636                 #endif
 637         #endif
 638                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 639
 640                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 641
 642                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 643
 644                 //****************** merge list file and create rabund and sabund files ******************************//
 645                 estart = time(NULL);
 646                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 647
 648                 #ifdef USE_MPI
 649                         if (pid == 0) { //only process 0 merges
 650                 #endif
 651
 652                 ListVector* listSingle;
 653                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 654
 655                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 656
 657                 mergeLists(listFileNames, labelBins, listSingle);
 658
 659                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 660
 661                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 662
 663                 //set list file as new current listfile
 664                 string current = "";
 665                 itTypes = outputTypes.find("list");
 666                 if (itTypes != outputTypes.end()) {
 667                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 668                 }
 669
 670                 //set rabund file as new current rabundfile
 671                 itTypes = outputTypes.find("rabund");
 672                 if (itTypes != outputTypes.end()) {
 673                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 674                 }
 675
 676                 //set sabund file as new current sabundfile
 677                 itTypes = outputTypes.find("sabund");
 678                 if (itTypes != outputTypes.end()) {
 679                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 680                 }
 681
 682                 m->mothurOutEndLine();
 683                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 684                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 685                 m->mothurOutEndLine();
 686
 687                 #ifdef USE_MPI
 688                         } //only process 0 merges
 689
 690                         //make everyone wait
 691                         MPI_Barrier(MPI_COMM_WORLD);
 692                 #endif
 693
 694                 return 0;
 695         }
 696         catch(exception& e) {
 697                 m->errorOut(e, "ClusterSplitCommand", "execute");
 698                 exit(1);
 699         }
 700 }
 701 //**********************************************************************************************************************
 702 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 703         try {
 704
 705                 map<float, int> labelBin;
 706                 vector<float> orderFloat;
 707                 int numSingleBins;
 708
 709                 //read in singletons
 710                 if (singleton != "none") {
 711
 712             ifstream in;
 713             m->openInputFile(singleton, in);
 714
 715                         string firstCol, secondCol;
 716                         listSingle = new ListVector();
 717
 718             if (countfile != "") { m->getline(in); m->gobble(in); }
 719
 720                         while (!in.eof()) {
 721                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 722                                 if (countfile == "") { listSingle->push_back(secondCol); }
 723                 else { listSingle->push_back(firstCol); }
 724                         }
 725
 726                         in.close();
 727                         m->mothurRemove(singleton);
 728
 729                         numSingleBins = listSingle->getNumBins();
 730                 }else{  listSingle = NULL; numSingleBins = 0;  }
 731
 732                 //go through users set and make them floats so we can sort them
 733                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 734                         float temp = -10.0;
 735
 736                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 737                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 738
 739                         if (temp <= cutoff) {
 740                                 orderFloat.push_back(temp);
 741                                 labelBin[temp] = numSingleBins; //initialize numbins
 742                         }
 743                 }
 744
 745                 //sort order
 746                 sort(orderFloat.begin(), orderFloat.end());
 747                 userLabels.clear();
 748
 749                 //get the list info from each file
 750                 for (int k = 0; k < listNames.size(); k++) {
 751
 752                         if (m->control_pressed) {
 753                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 754                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 755                                 return labelBin;
 756                         }
 757
 758                         InputData* input = new InputData(listNames[k], "list");
 759                         ListVector* list = input->getListVector();
 760                         string lastLabel = list->getLabel();
 761
 762                         string filledInList = listNames[k] + "filledInTemp";
 763                         ofstream outFilled;
 764                         m->openOutputFile(filledInList, outFilled);
 765
 766                         //for each label needed
 767                         for(int l = 0; l < orderFloat.size(); l++){
 768
 769                                 string thisLabel;
 770                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 771                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 772
 773                                 //this file has reached the end
 774                                 if (list == NULL) {
 775                                         list = input->getListVector(lastLabel, true);
 776                                 }else{  //do you have the distance, or do you need to fill in
 777
 778                                         float labelFloat;
 779                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 780                                         else { convert(list->getLabel(), labelFloat); }
 781
 782                                         //check for missing labels
 783                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 784                                                 //if its bigger get last label, otherwise keep it
 785                                                 delete list;
 786                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 787                                         }
 788                                         lastLabel = list->getLabel();
 789                                 }
 790
 791                                 //print to new file
 792                                 list->setLabel(thisLabel);
 793                                 list->print(outFilled);
 794
 795                                 //update labelBin
 796                                 labelBin[orderFloat[l]] += list->getNumBins();
 797
 798                                 delete list;
 799
 800                                 list = input->getListVector();
 801                         }
 802
 803                         if (list != NULL) { delete list; }
 804                         delete input;
 805
 806                         outFilled.close();
 807                         m->mothurRemove(listNames[k]);
 808                         rename(filledInList.c_str(), listNames[k].c_str());
 809                 }
 810
 811                 return labelBin;
 812         }
 813         catch(exception& e) {
 814                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 815                 exit(1);
 816         }
 817 }
 818 //**********************************************************************************************************************
 819 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 820         try {
 821                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 822                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 823
 824         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
 825         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
 826         string listFileName = fileroot+ tag + ".";
 827         if (countfile != "") { listFileName += "unique_"; }
 828         listFileName += getOutputFileNameTag("list");
 829
 830         if (countfile == "") {
 831             m->openOutputFile(sabundFileName,   outSabund);
 832             m->openOutputFile(rabundFileName,   outRabund);
 833             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 834             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 835
 836         }
 837                 m->openOutputFile(listFileName, outList);
 838         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 839
 840
 841                 map<float, int>::iterator itLabel;
 842
 843                 //for each label needed
 844                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 845
 846                         string thisLabel;
 847                         if (itLabel->first == -1) { thisLabel = "unique"; }
 848                         else { thisLabel = toString(itLabel->first,  length-1);  }
 849
 850                         outList << thisLabel << '\t' << itLabel->second << '\t';
 851
 852             RAbundVector* rabund = NULL;
 853             if (countfile == "") {
 854                 rabund = new RAbundVector();
 855                 rabund->setLabel(thisLabel);
 856             }
 857
 858                         //add in singletons
 859                         if (listSingle != NULL) {
 860                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 861                                         outList << listSingle->get(j) << '\t';
 862                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 863                                 }
 864                         }
 865
 866                         //get the list info from each file
 867                         for (int k = 0; k < listNames.size(); k++) {
 868
 869                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 870
 871                                 InputData* input = new InputData(listNames[k], "list");
 872                                 ListVector* list = input->getListVector(thisLabel);
 873
 874                                 //this file has reached the end
 875                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 876                                 else {
 877                                         for (int j = 0; j < list->getNumBins(); j++) {
 878                                                 outList << list->get(j) << '\t';
 879                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 880                                         }
 881                                         delete list;
 882                                 }
 883                                 delete input;
 884                         }
 885
 886             if (countfile == "") {
 887                 SAbundVector sabund = rabund->getSAbundVector();
 888                 sabund.print(outSabund);
 889                 rabund->print(outRabund);
 890             }
 891                         outList << endl;
 892
 893                         if (rabund != NULL) { delete rabund; }
 894                 }
 895
 896                 outList.close();
 897         if (countfile == "") {
 898             outRabund.close();
 899             outSabund.close();
 900                 }
 901                 if (listSingle != NULL) { delete listSingle;  }
 902
 903                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 904
 905                 return 0;
 906         }
 907         catch(exception& e) {
 908                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 909                 exit(1);
 910         }
 911 }
 912
 913 //**********************************************************************************************************************
 914
 915 void ClusterSplitCommand::printData(ListVector* oldList){
 916         try {
 917                 string label = oldList->getLabel();
 918                 RAbundVector oldRAbund = oldList->getRAbundVector();
 919
 920                 oldRAbund.setLabel(label);
 921                 if (m->isTrue(showabund)) {
 922                         oldRAbund.getSAbundVector().print(cout);
 923                 }
 924                 oldRAbund.print(outRabund);
 925                 oldRAbund.getSAbundVector().print(outSabund);
 926
 927                 oldList->print(outList);
 928         }
 929         catch(exception& e) {
 930                 m->errorOut(e, "ClusterSplitCommand", "printData");
 931                 exit(1);
 932         }
 933 }
 934 //**********************************************************************************************************************
 935 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 936         try {
 937
 938         vector<string> listFiles;
 939         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 940         dividedNames.resize(processors);
 941
 942         //for each file group figure out which process will complete it
 943         //want to divide the load intelligently so the big files are spread between processes
 944         for (int i = 0; i < distName.size(); i++) {
 945             //cout << i << endl;
 946             int processToAssign = (i+1) % processors;
 947             if (processToAssign == 0) { processToAssign = processors; }
 948
 949             dividedNames[(processToAssign-1)].push_back(distName[i]);
 950             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 951         }
 952
 953         //not lets reverse the order of ever other process, so we balance big files running with little ones
 954         for (int i = 0; i < processors; i++) {
 955             //cout << i << endl;
 956             int remainder = ((i+1) % processors);
 957             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 958         }
 959
 960         if (m->control_pressed) { return listFiles; }
 961
 962         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 963                 int process = 1;
 964                 processIDS.clear();
 965
 966                 //loop through and create all the processes you want
 967                 while (process != processors) {
 968                         int pid = fork();
 969
 970                         if (pid > 0) {
 971                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 972                                 process++;
 973                         }else if (pid == 0){
 974                                 set<string> labels;
 975                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 976
 977                                 //write out names to file
 978                                 string filename = toString(getpid()) + ".temp";
 979                                 ofstream out;
 980                                 m->openOutputFile(filename, out);
 981                                 out << tag << endl;
 982                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 983                                 out.close();
 984
 985                                 //print out labels
 986                                 ofstream outLabels;
 987                                 filename = toString(getpid()) + ".temp.labels";
 988                                 m->openOutputFile(filename, outLabels);
 989
 990                                 outLabels << cutoff << endl;
 991                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 992                                         outLabels << (*it) << endl;
 993                                 }
 994                                 outLabels.close();
 995
 996                                 exit(0);
 997                         }else {
 998                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 999                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1000                                 exit(0);
1001                         }
1002                 }
1003
1004         //do your part
1005         listFiles = cluster(dividedNames[0], labels);
1006
1007                 //force parent to wait until all the processes are done
1008                 for (int i=0;i< processIDS.size();i++) {
1009                         int temp = processIDS[i];
1010                         wait(&temp);
1011                 }
1012
1013         //get list of list file names from each process
1014         for(int i=0;i<processIDS.size();i++){
1015             string filename = toString(processIDS[i]) + ".temp";
1016             ifstream in;
1017             m->openInputFile(filename, in);
1018
1019             in >> tag; m->gobble(in);
1020
1021             while(!in.eof()) {
1022                 string tempName;
1023                 in >> tempName; m->gobble(in);
1024                 listFiles.push_back(tempName);
1025             }
1026             in.close();
1027             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1028
1029             //get labels
1030             filename = toString(processIDS[i]) + ".temp.labels";
1031             ifstream in2;
1032             m->openInputFile(filename, in2);
1033
1034             float tempCutoff;
1035             in2 >> tempCutoff; m->gobble(in2);
1036             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1037
1038             while(!in2.eof()) {
1039                 string tempName;
1040                 in2 >> tempName; m->gobble(in2);
1041                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1042             }
1043             in2.close();
1044             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1045         }
1046
1047
1048     #else
1049
1050         //////////////////////////////////////////////////////////////////////////////////////////////////////
1051                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1052                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1053                 //Taking advantage of shared memory to allow both threads to add labels.
1054                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1055                 /*
1056                 vector<clusterData*> pDataArray;
1057                 DWORD   dwThreadIdArray[processors-1];
1058                 HANDLE  hThreadArray[processors-1];
1059
1060                 //Create processor worker threads.
1061                 for( int i=1; i<processors; i++ ){
1062                         // Allocate memory for thread data.
1063                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1064                         pDataArray.push_back(tempCluster);
1065                         processIDS.push_back(i);
1066
1067                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1068                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1069                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1070
1071                 }
1072
1073         //do your part
1074         listFiles = cluster(dividedNames[0], labels);
1075
1076                 //Wait until all threads have terminated.
1077                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1078
1079                 //Close all thread handles and free memory allocations.
1080                 for(int i=0; i < pDataArray.size(); i++){
1081             //get tag
1082             tag = pDataArray[i]->tag;
1083             //get listfiles created
1084             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1085             //get labels
1086             set<string>::iterator it;
1087             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1088                         //check cutoff
1089             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1090                         CloseHandle(hThreadArray[i]);
1091                         delete pDataArray[i];
1092                 }
1093 */
1094         #endif
1095
1096         return listFiles;
1097
1098         }
1099         catch(exception& e) {
1100                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1101                 exit(1);
1102         }
1103 }
1104 //**********************************************************************************************************************
1105
1106 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1107         try {
1108
1109                 vector<string> listFileNames;
1110                 double smallestCutoff = cutoff;
1111
1112                 //cluster each distance file
1113                 for (int i = 0; i < distNames.size(); i++) {
1114
1115                         string thisNamefile = distNames[i].begin()->second;
1116                         string thisDistFile = distNames[i].begin()->first;
1117
1118                         string listFileName = "";
1119             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1120             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1121
1122                         if (m->control_pressed) { //clean up
1123                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1124                                 listFileNames.clear(); return listFileNames;
1125                         }
1126
1127             listFileNames.push_back(listFileName);
1128         }
1129
1130                 cutoff = smallestCutoff;
1131
1132                 return listFileNames;
1133
1134         }
1135         catch(exception& e) {
1136                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1137                 exit(1);
1138         }
1139
1140
1141 }
1142 //**********************************************************************************************************************
1143 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1144         try {
1145         string listFileName = "";
1146
1147         ListVector* list = NULL;
1148         ListVector oldList;
1149         RAbundVector* rabund = NULL;
1150
1151 #ifdef USE_MPI
1152         int pid;
1153         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1154
1155         //output your files too
1156         if (pid != 0) {
1157             cout << endl << "Reading " << thisDistFile << endl;
1158         }
1159 #endif
1160
1161         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1162
1163         //reads phylip file storing data in 2D vector, also fills list and rabund
1164         bool sim = false;
1165                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1166
1167         NameAssignment* nameMap = NULL;
1168         CountTable* ct = NULL;
1169         if(namefile != ""){
1170                         nameMap = new NameAssignment(thisNamefile);
1171                         nameMap->readMap();
1172             cluster->readPhylipFile(thisDistFile, nameMap);
1173                 }else if (countfile != "") {
1174             ct = new CountTable();
1175             ct->readTable(thisNamefile);
1176             cluster->readPhylipFile(thisDistFile, ct);
1177         }
1178         tag = cluster->getTag();
1179
1180                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1181             else { delete ct; } delete cluster; return 0; }
1182
1183                 list = cluster->getListVector();
1184                 rabund = cluster->getRAbundVector();
1185
1186                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1187                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1188         listFileName = fileroot+ tag + ".list";
1189
1190         ofstream listFile;
1191                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1192
1193                 float previousDist = 0.00000;
1194                 float rndPreviousDist = 0.00000;
1195                 oldList = *list;
1196
1197 #ifdef USE_MPI
1198         //output your files too
1199         if (pid != 0) {
1200             cout << endl << "Clustering " << thisDistFile << endl;
1201         }
1202 #endif
1203
1204         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1205
1206                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1207                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1208                 else { delete ct; } return listFileName;  }
1209
1210                         cluster->update(cutoff);
1211
1212                         float dist = cluster->getSmallDist();
1213                         float rndDist;
1214                         if (hard) {
1215                                 rndDist = m->ceilDist(dist, precision);
1216                         }else{
1217                                 rndDist = m->roundDist(dist, precision);
1218                         }
1219
1220             if(previousDist <= 0.0000 && dist != previousDist){
1221                 oldList.setLabel("unique");
1222                 oldList.print(listFile);
1223                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1224             }
1225             else if(rndDist != rndPreviousDist){
1226                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1227                 oldList.print(listFile);
1228                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1229             }
1230
1231
1232                         previousDist = dist;
1233                         rndPreviousDist = rndDist;
1234                         oldList = *list;
1235                 }
1236
1237                 if(previousDist <= 0.0000){
1238             oldList.setLabel("unique");
1239             oldList.print(listFile);
1240             if (labels.count("unique") == 0) { labels.insert("unique"); }
1241         }
1242         else if(rndPreviousDist<cutoff){
1243             oldList.setLabel(toString(rndPreviousDist,  length-1));
1244             oldList.print(listFile);
1245             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1246         }
1247
1248
1249                 listFile.close();
1250
1251                 delete cluster;  delete list; delete rabund;
1252         if(namefile != ""){     delete nameMap; }
1253         else { delete ct; }
1254
1255         m->mothurRemove(thisDistFile);
1256         m->mothurRemove(thisNamefile);
1257
1258         return listFileName;
1259
1260         }
1261         catch(exception& e) {
1262                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1263                 exit(1);
1264         }
1265 }
1266
1267 //**********************************************************************************************************************
1268 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1269         try {
1270         string listFileName = "";
1271
1272         Cluster* cluster = NULL;
1273         SparseDistanceMatrix* matrix = NULL;
1274         ListVector* list = NULL;
1275         ListVector oldList;
1276         RAbundVector* rabund = NULL;
1277
1278         if (m->control_pressed) { return listFileName; }
1279
1280 #ifdef USE_MPI
1281         int pid;
1282         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1283
1284         //output your files too
1285         if (pid != 0) {
1286             cout << endl << "Reading " << thisDistFile << endl;
1287         }
1288 #endif
1289
1290         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1291
1292         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1293         read->setCutoff(cutoff);
1294
1295         NameAssignment* nameMap = NULL;
1296         CountTable* ct = NULL;
1297                 if(namefile != ""){
1298                         nameMap = new NameAssignment(thisNamefile);
1299                         nameMap->readMap();
1300             read->read(nameMap);
1301                 }else if (countfile != "") {
1302             ct = new CountTable();
1303             ct->readTable(thisNamefile);
1304             read->read(ct);
1305         }else { read->read(nameMap); }
1306
1307                 list = read->getListVector();
1308         oldList = *list;
1309                 matrix = read->getDMatrix();
1310
1311                 if(countfile != "") {
1312             rabund = new RAbundVector();
1313             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1314             delete ct;
1315         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1316
1317         delete read;  read = NULL;
1318         if (namefile != "") { delete nameMap; nameMap = NULL; }
1319
1320
1321 #ifdef USE_MPI
1322         //output your files too
1323         if (pid != 0) {
1324             cout << endl << "Clustering " << thisDistFile << endl;
1325         }
1326 #endif
1327
1328         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1329
1330         //create cluster
1331         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1332         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1333         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1334         tag = cluster->getTag();
1335
1336         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1337         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1338
1339         ofstream listFile;
1340         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1341
1342         listFileName = fileroot+ tag + ".list";
1343
1344         float previousDist = 0.00000;
1345         float rndPreviousDist = 0.00000;
1346
1347         oldList = *list;
1348
1349         print_start = true;
1350         start = time(NULL);
1351         double saveCutoff = cutoff;
1352
1353         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1354
1355             if (m->control_pressed) { //clean up
1356                 delete matrix; delete list;     delete cluster; delete rabund;
1357                 listFile.close();
1358                 m->mothurRemove(listFileName);
1359                 return listFileName;
1360             }
1361
1362             cluster->update(saveCutoff);
1363
1364             float dist = matrix->getSmallDist();
1365             float rndDist;
1366             if (hard) {
1367                 rndDist = m->ceilDist(dist, precision);
1368             }else{
1369                 rndDist = m->roundDist(dist, precision);
1370             }
1371
1372             if(previousDist <= 0.0000 && dist != previousDist){
1373                 oldList.setLabel("unique");
1374                 oldList.print(listFile);
1375                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1376             }
1377             else if(rndDist != rndPreviousDist){
1378                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1379                 oldList.print(listFile);
1380                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1381             }
1382
1383             previousDist = dist;
1384             rndPreviousDist = rndDist;
1385             oldList = *list;
1386         }
1387
1388
1389         if(previousDist <= 0.0000){
1390             oldList.setLabel("unique");
1391             oldList.print(listFile);
1392             if (labels.count("unique") == 0) { labels.insert("unique"); }
1393         }
1394         else if(rndPreviousDist<cutoff){
1395             oldList.setLabel(toString(rndPreviousDist,  length-1));
1396             oldList.print(listFile);
1397             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1398         }
1399
1400         delete matrix; delete list;     delete cluster; delete rabund;
1401         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1402         listFile.close();
1403
1404         if (m->control_pressed) { //clean up
1405             m->mothurRemove(listFileName);
1406             return listFileName;
1407         }
1408
1409         m->mothurRemove(thisDistFile);
1410         m->mothurRemove(thisNamefile);
1411
1412         if (saveCutoff != cutoff) {
1413             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1414             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1415
1416             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1417         }
1418
1419         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1420
1421         return listFileName;
1422
1423         }
1424         catch(exception& e) {
1425                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1426                 exit(1);
1427         }
1428 }
1429 //**********************************************************************************************************************
1430
1431 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1432         try{
1433
1434 #ifdef USE_MPI
1435                 int pid;
1436                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1437
1438                 if (pid != 0) {
1439 #endif
1440
1441                 string thisOutputDir = outputDir;
1442                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1443                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1444                 m->mothurRemove(outputFileName);
1445
1446
1447                 for (int i = 0; i < distNames.size(); i++) {
1448                         if (m->control_pressed) {  return 0; }
1449
1450                         string thisDistFile = distNames[i].begin()->first;
1451
1452                         m->appendFiles(thisDistFile, outputFileName);
1453                 }
1454
1455                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1456
1457 #ifdef USE_MPI
1458                 }
1459 #endif
1460
1461                 return 0;
1462
1463
1464         }
1465         catch(exception& e) {
1466                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1467                 exit(1);
1468         }
1469 }
1470 //**********************************************************************************************************************
1471 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1472     try {
1473         rabund->setLabel(list->getLabel());
1474         for(int i = 0; i < list->getNumBins(); i++) {
1475             if (m->control_pressed) { break; }
1476             vector<string> binNames;
1477             string bin = list->get(i);
1478             m->splitAtComma(bin, binNames);
1479             int total = 0;
1480             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1481             rabund->push_back(total);
1482         }
1483         return 0;
1484     }
1485     catch(exception& e) {
1486                 m->errorOut(e, "ClusterCommand", "createRabund");
1487                 exit(1);
1488         }
1489
1490 }
1491 //**********************************************************************************************************************