clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
  20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
  21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
  22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
  23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
  24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
  25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
  26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
  27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
  28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
  29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
  30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
  31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
  32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
  33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
  34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
  35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
  36
  37                 vector<string> myArray;
  38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  39                 return myArray;
  40         }
  41         catch(exception& e) {
  42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  43                 exit(1);
  44         }
  45 }
  46 //**********************************************************************************************************************
  47 string ClusterSplitCommand::getHelpString(){
  48         try {
  49                 string helpString = "";
  50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
  51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  59                 helpString += "The name parameter allows you to enter your name file. \n";
  60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
  62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  70 #ifdef USE_MPI
  71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  72 #endif
  73                 helpString += "The cluster.split command should be in the following format: \n";
  74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  76                 return helpString;
  77         }
  78         catch(exception& e) {
  79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  80                 exit(1);
  81         }
  82 }
  83 //**********************************************************************************************************************
  84 string ClusterSplitCommand::getOutputPattern(string type) {
  85     try {
  86         string pattern = "";
  87
  88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
  89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
  90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
  91         else if (type == "column") {  pattern = "[filename],dist"; }
  92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
  93
  94         return pattern;
  95     }
  96     catch(exception& e) {
  97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
  98         exit(1);
  99     }
 100 }
 101 //**********************************************************************************************************************
 102 ClusterSplitCommand::ClusterSplitCommand(){
 103         try {
 104                 abort = true; calledHelp = true;
 105                 setParameters();
 106                 vector<string> tempOutNames;
 107                 outputTypes["list"] = tempOutNames;
 108                 outputTypes["rabund"] = tempOutNames;
 109                 outputTypes["sabund"] = tempOutNames;
 110                 outputTypes["column"] = tempOutNames;
 111         }
 112         catch(exception& e) {
 113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 114                 exit(1);
 115         }
 116 }
 117 //**********************************************************************************************************************
 118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 120         try{
 121                 abort = false; calledHelp = false;
 122                 format = "";
 123
 124                 //allow user to run help
 125                 if(option == "help") { help(); abort = true; calledHelp = true; }
 126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 127
 128                 else {
 129                         vector<string> myArray = setParameters();
 130
 131                         OptionParser parser(option);
 132                         map<string,string> parameters = parser.getParameters();
 133
 134                         ValidParameters validParameter("cluster.split");
 135
 136                         //check to make sure all parameters are valid for command
 137                         map<string,string>::iterator it;
 138                         for (it = parameters.begin(); it != parameters.end(); it++) {
 139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 140                                         abort = true;
 141                                 }
 142                         }
 143
 144                         //initialize outputTypes
 145                         vector<string> tempOutNames;
 146                         outputTypes["list"] = tempOutNames;
 147                         outputTypes["rabund"] = tempOutNames;
 148                         outputTypes["sabund"] = tempOutNames;
 149                         outputTypes["column"] = tempOutNames;
 150
 151                         //if the user changes the output directory command factory will send this info to us in the output parameter
 152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 153
 154                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 156                         if (inputDir == "not found"){   inputDir = "";          }
 157                         else {
 158                                 string path;
 159                                 it = parameters.find("phylip");
 160                                 //user has given a template file
 161                                 if(it != parameters.end()){
 162                                         path = m->hasPath(it->second);
 163                                         //if the user has not given a path then, add inputdir. else leave path alone.
 164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 165                                 }
 166
 167                                 it = parameters.find("column");
 168                                 //user has given a template file
 169                                 if(it != parameters.end()){
 170                                         path = m->hasPath(it->second);
 171                                         //if the user has not given a path then, add inputdir. else leave path alone.
 172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 173                                 }
 174
 175                                 it = parameters.find("name");
 176                                 //user has given a template file
 177                                 if(it != parameters.end()){
 178                                         path = m->hasPath(it->second);
 179                                         //if the user has not given a path then, add inputdir. else leave path alone.
 180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 181                                 }
 182
 183                                 it = parameters.find("taxonomy");
 184                                 //user has given a template file
 185                                 if(it != parameters.end()){
 186                                         path = m->hasPath(it->second);
 187                                         //if the user has not given a path then, add inputdir. else leave path alone.
 188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 189                                 }
 190
 191                                 it = parameters.find("fasta");
 192                                 //user has given a template file
 193                                 if(it != parameters.end()){
 194                                         path = m->hasPath(it->second);
 195                                         //if the user has not given a path then, add inputdir. else leave path alone.
 196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 197                                 }
 198
 199                 it = parameters.find("count");
 200                                 //user has given a template file
 201                                 if(it != parameters.end()){
 202                                         path = m->hasPath(it->second);
 203                                         //if the user has not given a path then, add inputdir. else leave path alone.
 204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 205                                 }
 206                         }
 207
 208                         //check for required parameters
 209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 210                         if (phylipfile == "not open") { abort = true; }
 211                         else if (phylipfile == "not found") { phylipfile = ""; }
 212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 213
 214                         columnfile = validParameter.validFile(parameters, "column", true);
 215                         if (columnfile == "not open") { abort = true; }
 216                         else if (columnfile == "not found") { columnfile = ""; }
 217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 218
 219                         namefile = validParameter.validFile(parameters, "name", true);
 220                         if (namefile == "not open") { abort = true; namefile = "";}
 221                         else if (namefile == "not found") { namefile = "";  }
 222                         else { m->setNameFile(namefile); }
 223
 224             countfile = validParameter.validFile(parameters, "count", true);
 225                         if (countfile == "not open") { abort = true; countfile = "";}
 226                         else if (countfile == "not found") { countfile = "";  }
 227                         else { m->setCountTableFile(countfile); }
 228
 229                         fastafile = validParameter.validFile(parameters, "fasta", true);
 230                         if (fastafile == "not open") { abort = true; }
 231                         else if (fastafile == "not found") { fastafile = ""; }
 232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 233
 234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 235                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 236                         else if (taxFile == "not found") { taxFile = ""; }
 237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 238
 239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 240                                 //is there are current file available for either of these?
 241                                 //give priority to column, then phylip, then fasta
 242                                 columnfile = m->getColumnFile();
 243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 244                                 else {
 245                                         phylipfile = m->getPhylipFile();
 246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 247                                         else {
 248                                                 fastafile = m->getFastaFile();
 249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 250                                                 else {
 251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 252                                                         abort = true;
 253                                                 }
 254                                         }
 255                                 }
 256                         }
 257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 258
 259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 260
 261                         if (columnfile != "") {
 262                                 if ((namefile == "") && (countfile == "")) {
 263                                         namefile = m->getNameFile();
 264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 265                                         else {
 266                                                 countfile = m->getCountTableFile();
 267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 268                         else {
 269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 270                             abort = true;
 271                         }
 272                                         }
 273                                 }
 274                         }
 275
 276                         if (fastafile != "") {
 277                                 if (taxFile == "") {
 278                                         taxFile = m->getTaxonomyFile();
 279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 280                                         else {
 281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 282                                                 abort = true;
 283                                         }
 284                                 }
 285
 286                                 if ((namefile == "") && (countfile == "")) {
 287                                         namefile = m->getNameFile();
 288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 289                                         else {
 290                                                 countfile = m->getCountTableFile();
 291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 292                         else {
 293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 294                             abort = true;
 295                         }
 296                                         }
 297                                 }
 298                         }
 299
 300                         //check for optional parameter and set defaults
 301                         // ...at some point should added some additional type checking...
 302                         //get user cutoff and precision or use defaults
 303                         string temp;
 304                         temp = validParameter.validFile(parameters, "precision", false);
 305                         if (temp == "not found") { temp = "100"; }
 306                         //saves precision legnth for formatting below
 307                         length = temp.length();
 308                         m->mothurConvert(temp, precision);
 309
 310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 311                         hard = m->isTrue(temp);
 312
 313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 314                         large = m->isTrue(temp);
 315
 316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 317                         m->setProcessors(temp);
 318                         m->mothurConvert(temp, processors);
 319
 320                         temp = validParameter.validFile(parameters, "splitmethod", false);
 321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 322                                 if (temp == "not found")  { splitmethod = "distance"; }
 323                                 else {  splitmethod = temp; }
 324                         }
 325
 326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 327                         classic = m->isTrue(temp);
 328
 329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 330
 331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 332                         m->mothurConvert(temp, cutoff);
 333                         cutoff += (5 / (precision * 10.0));
 334
 335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 336                         m->mothurConvert(temp, taxLevelCutoff);
 337
 338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 339
 340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 342
 343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 345
 346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 347
 348                         showabund = validParameter.validFile(parameters, "showabund", false);
 349                         if (showabund == "not found") { showabund = "T"; }
 350
 351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
 352             runCluster = m->isTrue(temp);
 353
 354                         timing = validParameter.validFile(parameters, "timing", false);
 355                         if (timing == "not found") { timing = "F"; }
 356
 357                 }
 358         }
 359         catch(exception& e) {
 360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 361                 exit(1);
 362         }
 363 }
 364
 365 //**********************************************************************************************************************
 366
 367 int ClusterSplitCommand::execute(){
 368         try {
 369
 370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 371
 372                 time_t estart;
 373                 vector<string> listFileNames;
 374                 set<string> labels;
 375                 string singletonName = "";
 376                 double saveCutoff = cutoff;
 377
 378                 //****************** file prep work ******************************//
 379                 #ifdef USE_MPI
 380                         int pid;
 381                         int tag = 2001;
 382                         MPI_Status status;
 383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 385
 386                         if (pid == 0) { //only process 0 converts and splits
 387
 388                 #endif
 389
 390                 //if user gave a phylip file convert to column file
 391                 if (format == "phylip") {
 392                         estart = time(NULL);
 393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 394
 395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 396
 397                         NameAssignment* nameMap = NULL;
 398                         convert->setFormat("phylip");
 399                         convert->read(nameMap);
 400
 401                         if (m->control_pressed) {  delete convert;  return 0;  }
 402
 403                         distfile = convert->getOutputFile();
 404
 405                         //if no names file given with phylip file, create it
 406                         ListVector* listToMakeNameFile =  convert->getListVector();
 407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 408                                 ofstream out;
 409                                 namefile = phylipfile + ".names";
 410                                 m->openOutputFile(namefile, out);
 411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 412                                         string bin = listToMakeNameFile->get(i);
 413                                         out << bin << '\t' << bin << endl;
 414                                 }
 415                                 out.close();
 416                         }
 417                         delete listToMakeNameFile;
 418                         delete convert;
 419
 420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 421                 }
 422                 if (m->control_pressed) { return 0; }
 423
 424                 estart = time(NULL);
 425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 426
 427                 //split matrix into non-overlapping groups
 428                 SplitMatrix* split;
 429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 433
 434                 split->split();
 435
 436                 if (m->control_pressed) { delete split; return 0; }
 437
 438                 singletonName = split->getSingletonNames();
 439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 440                 delete split;
 441
 442                 //output a merged distance file
 443                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
 444
 445
 446                 if (m->control_pressed) { return 0; }
 447
 448                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 449                 estart = time(NULL);
 450
 451         if (!runCluster) {
 452
 453                 m->mothurOutEndLine();
 454                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 455                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
 456                 m->mothurOutEndLine();
 457                 return 0;
 458
 459         }
 460
 461                 //****************** break up files between processes and cluster each file set ******************************//
 462         #ifdef USE_MPI
 463                         ////you are process 0 from above////
 464
 465                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 466                         dividedNames.resize(processors);
 467
 468                         //for each file group figure out which process will complete it
 469                         //want to divide the load intelligently so the big files are spread between processes
 470                         for (int i = 0; i < distName.size(); i++) {
 471                                 int processToAssign = (i+1) % processors;
 472                                 if (processToAssign == 0) { processToAssign = processors; }
 473
 474                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 475                         }
 476
 477                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 478                         for (int i = 0; i < processors; i++) {
 479                                 int remainder = ((i+1) % processors);
 480                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 481                         }
 482
 483
 484                         //send each child the list of files it needs to process
 485                         for(int i = 1; i < processors; i++) {
 486                                 //send number of file pairs
 487                                 int num = dividedNames[i].size();
 488                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 489
 490                                 for (int j = 0; j < num; j++) { //send filenames to process i
 491                                         char tempDistFileName[1024];
 492                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 493                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 494
 495                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 496                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 497
 498                                         char tempNameFileName[1024];
 499                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 500                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 501
 502                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 503                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 504                                 }
 505                         }
 506
 507                         //process your share
 508                         listFileNames = cluster(dividedNames[0], labels);
 509
 510                         //receive the other processes info
 511                         for(int i = 1; i < processors; i++) {
 512                                 int num = dividedNames[i].size();
 513
 514                                 double tempCutoff;
 515                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 516                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 517
 518                                 //send list filenames to root process
 519                                 for (int j = 0; j < num; j++) {
 520                                         int lengthList = 0;
 521                                         char tempListFileName[1024];
 522
 523                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 524                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 525
 526                                         string myListFileName = tempListFileName;
 527                                         myListFileName = myListFileName.substr(0, lengthList);
 528
 529                                         listFileNames.push_back(myListFileName);
 530                                 }
 531
 532                                 //send Labels to root process
 533                                 int numLabels = 0;
 534                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 535
 536                                 for (int j = 0; j < numLabels; j++) {
 537                                         int lengthLabel = 0;
 538                                         char tempLabel[100];
 539
 540                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 541                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 542
 543                                         string myLabel = tempLabel;
 544                                         myLabel = myLabel.substr(0, lengthLabel);
 545
 546                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 547                                 }
 548                         }
 549
 550                 }else { //you are a child process
 551                         vector < map<string, string> >  myNames;
 552
 553                         //recieve the files you need to process
 554                         //receive number of file pairs
 555                         int num = 0;
 556                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 557
 558                         myNames.resize(num);
 559
 560                         for (int j = 0; j < num; j++) { //receive filenames to process
 561                                 int lengthDist = 0;
 562                                 char tempDistFileName[1024];
 563
 564                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 565                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 566
 567                                 string myDistFileName = tempDistFileName;
 568                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 569
 570                                 int lengthName = 0;
 571                                 char tempNameFileName[1024];
 572
 573                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 574                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 575
 576                                 string myNameFileName = tempNameFileName;
 577                                 myNameFileName = myNameFileName.substr(0, lengthName);
 578
 579                                 //save file name
 580                                 myNames[j][myDistFileName] = myNameFileName;
 581                         }
 582
 583                         //process them
 584                         listFileNames = cluster(myNames, labels);
 585
 586                         //send cutoff
 587                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 588
 589                         //send list filenames to root process
 590                         for (int j = 0; j < num; j++) {
 591                                 char tempListFileName[1024];
 592                                 strcpy(tempListFileName, listFileNames[j].c_str());
 593                                 int lengthList = listFileNames[j].length();
 594
 595                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 596                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 597                         }
 598
 599                         //send Labels to root process
 600                         int numLabels = labels.size();
 601                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 602
 603                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 604                                 char tempLabel[100];
 605                                 strcpy(tempLabel, (*it).c_str());
 606                                 int lengthLabel = (*it).length();
 607
 608                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 609                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 610                         }
 611                 }
 612
 613                 //make everyone wait
 614                 MPI_Barrier(MPI_COMM_WORLD);
 615
 616         #else
 617                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 618                 //sanity check
 619                 if (processors > distName.size()) { processors = distName.size(); }
 620
 621                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 622                                 if(processors == 1){
 623                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 624                                 }else{
 625                                         listFileNames = createProcesses(distName, labels);
 626                 }
 627                 #else
 628                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 629                 #endif
 630         #endif
 631                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 632
 633                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 634
 635                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 636
 637                 //****************** merge list file and create rabund and sabund files ******************************//
 638                 estart = time(NULL);
 639                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 640
 641                 #ifdef USE_MPI
 642                         if (pid == 0) { //only process 0 merges
 643                 #endif
 644
 645                 ListVector* listSingle;
 646                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 647
 648                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 649
 650                 mergeLists(listFileNames, labelBins, listSingle);
 651
 652                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 653
 654                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 655
 656                 //set list file as new current listfile
 657                 string current = "";
 658                 itTypes = outputTypes.find("list");
 659                 if (itTypes != outputTypes.end()) {
 660                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 661                 }
 662
 663                 //set rabund file as new current rabundfile
 664                 itTypes = outputTypes.find("rabund");
 665                 if (itTypes != outputTypes.end()) {
 666                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 667                 }
 668
 669                 //set sabund file as new current sabundfile
 670                 itTypes = outputTypes.find("sabund");
 671                 if (itTypes != outputTypes.end()) {
 672                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 673                 }
 674
 675                 m->mothurOutEndLine();
 676                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 677                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 678                 m->mothurOutEndLine();
 679
 680                 #ifdef USE_MPI
 681                         } //only process 0 merges
 682
 683                         //make everyone wait
 684                         MPI_Barrier(MPI_COMM_WORLD);
 685                 #endif
 686
 687                 return 0;
 688         }
 689         catch(exception& e) {
 690                 m->errorOut(e, "ClusterSplitCommand", "execute");
 691                 exit(1);
 692         }
 693 }
 694 //**********************************************************************************************************************
 695 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 696         try {
 697
 698                 map<float, int> labelBin;
 699                 vector<float> orderFloat;
 700                 int numSingleBins;
 701
 702                 //read in singletons
 703                 if (singleton != "none") {
 704
 705             ifstream in;
 706             m->openInputFile(singleton, in);
 707
 708                         string firstCol, secondCol;
 709                         listSingle = new ListVector();
 710
 711             if (countfile != "") { m->getline(in); m->gobble(in); }
 712
 713                         while (!in.eof()) {
 714                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 715                                 if (countfile == "") { listSingle->push_back(secondCol); }
 716                 else { listSingle->push_back(firstCol); }
 717                         }
 718
 719                         in.close();
 720                         m->mothurRemove(singleton);
 721
 722                         numSingleBins = listSingle->getNumBins();
 723                 }else{  listSingle = NULL; numSingleBins = 0;  }
 724
 725                 //go through users set and make them floats so we can sort them
 726                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 727                         float temp = -10.0;
 728
 729                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 730                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 731
 732                         if (temp <= cutoff) {
 733                                 orderFloat.push_back(temp);
 734                                 labelBin[temp] = numSingleBins; //initialize numbins
 735                         }
 736                 }
 737
 738                 //sort order
 739                 sort(orderFloat.begin(), orderFloat.end());
 740                 userLabels.clear();
 741
 742                 //get the list info from each file
 743                 for (int k = 0; k < listNames.size(); k++) {
 744
 745                         if (m->control_pressed) {
 746                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 747                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 748                                 return labelBin;
 749                         }
 750
 751                         InputData* input = new InputData(listNames[k], "list");
 752                         ListVector* list = input->getListVector();
 753                         string lastLabel = list->getLabel();
 754
 755                         string filledInList = listNames[k] + "filledInTemp";
 756                         ofstream outFilled;
 757                         m->openOutputFile(filledInList, outFilled);
 758
 759                         //for each label needed
 760                         for(int l = 0; l < orderFloat.size(); l++){
 761
 762                                 string thisLabel;
 763                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 764                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 765
 766                                 //this file has reached the end
 767                                 if (list == NULL) {
 768                                         list = input->getListVector(lastLabel, true);
 769                                 }else{  //do you have the distance, or do you need to fill in
 770
 771                                         float labelFloat;
 772                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 773                                         else { convert(list->getLabel(), labelFloat); }
 774
 775                                         //check for missing labels
 776                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 777                                                 //if its bigger get last label, otherwise keep it
 778                                                 delete list;
 779                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 780                                         }
 781                                         lastLabel = list->getLabel();
 782                                 }
 783
 784                                 //print to new file
 785                                 list->setLabel(thisLabel);
 786                                 list->print(outFilled);
 787
 788                                 //update labelBin
 789                                 labelBin[orderFloat[l]] += list->getNumBins();
 790
 791                                 delete list;
 792
 793                                 list = input->getListVector();
 794                         }
 795
 796                         if (list != NULL) { delete list; }
 797                         delete input;
 798
 799                         outFilled.close();
 800                         m->mothurRemove(listNames[k]);
 801                         rename(filledInList.c_str(), listNames[k].c_str());
 802                 }
 803
 804                 return labelBin;
 805         }
 806         catch(exception& e) {
 807                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 808                 exit(1);
 809         }
 810 }
 811 //**********************************************************************************************************************
 812 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 813         try {
 814                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 815                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 816
 817         map<string, string> variables;
 818         variables["[filename]"] = fileroot;
 819         variables["[clustertag]"] = tag;
 820         string sabundFileName = getOutputFileName("sabund", variables);
 821         string rabundFileName = getOutputFileName("rabund", variables);
 822         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
 823         string listFileName = getOutputFileName("list", variables);
 824
 825         if (countfile == "") {
 826             m->openOutputFile(sabundFileName,   outSabund);
 827             m->openOutputFile(rabundFileName,   outRabund);
 828             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 829             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 830
 831         }
 832                 m->openOutputFile(listFileName, outList);
 833         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 834
 835
 836                 map<float, int>::iterator itLabel;
 837
 838                 //for each label needed
 839                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 840
 841                         string thisLabel;
 842                         if (itLabel->first == -1) { thisLabel = "unique"; }
 843                         else { thisLabel = toString(itLabel->first,  length-1);  }
 844
 845                         outList << thisLabel << '\t' << itLabel->second << '\t';
 846
 847             RAbundVector* rabund = NULL;
 848             if (countfile == "") {
 849                 rabund = new RAbundVector();
 850                 rabund->setLabel(thisLabel);
 851             }
 852
 853                         //add in singletons
 854                         if (listSingle != NULL) {
 855                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 856                                         outList << listSingle->get(j) << '\t';
 857                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 858                                 }
 859                         }
 860
 861                         //get the list info from each file
 862                         for (int k = 0; k < listNames.size(); k++) {
 863
 864                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 865
 866                                 InputData* input = new InputData(listNames[k], "list");
 867                                 ListVector* list = input->getListVector(thisLabel);
 868
 869                                 //this file has reached the end
 870                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 871                                 else {
 872                                         for (int j = 0; j < list->getNumBins(); j++) {
 873                                                 outList << list->get(j) << '\t';
 874                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 875                                         }
 876                                         delete list;
 877                                 }
 878                                 delete input;
 879                         }
 880
 881             if (countfile == "") {
 882                 SAbundVector sabund = rabund->getSAbundVector();
 883                 sabund.print(outSabund);
 884                 rabund->print(outRabund);
 885             }
 886                         outList << endl;
 887
 888                         if (rabund != NULL) { delete rabund; }
 889                 }
 890
 891                 outList.close();
 892         if (countfile == "") {
 893             outRabund.close();
 894             outSabund.close();
 895                 }
 896                 if (listSingle != NULL) { delete listSingle;  }
 897
 898                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 899
 900                 return 0;
 901         }
 902         catch(exception& e) {
 903                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 904                 exit(1);
 905         }
 906 }
 907
 908 //**********************************************************************************************************************
 909
 910 void ClusterSplitCommand::printData(ListVector* oldList){
 911         try {
 912                 string label = oldList->getLabel();
 913                 RAbundVector oldRAbund = oldList->getRAbundVector();
 914
 915                 oldRAbund.setLabel(label);
 916                 if (m->isTrue(showabund)) {
 917                         oldRAbund.getSAbundVector().print(cout);
 918                 }
 919                 oldRAbund.print(outRabund);
 920                 oldRAbund.getSAbundVector().print(outSabund);
 921
 922                 oldList->print(outList);
 923         }
 924         catch(exception& e) {
 925                 m->errorOut(e, "ClusterSplitCommand", "printData");
 926                 exit(1);
 927         }
 928 }
 929 //**********************************************************************************************************************
 930 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 931         try {
 932
 933         vector<string> listFiles;
 934         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 935         dividedNames.resize(processors);
 936
 937         //for each file group figure out which process will complete it
 938         //want to divide the load intelligently so the big files are spread between processes
 939         for (int i = 0; i < distName.size(); i++) {
 940             //cout << i << endl;
 941             int processToAssign = (i+1) % processors;
 942             if (processToAssign == 0) { processToAssign = processors; }
 943
 944             dividedNames[(processToAssign-1)].push_back(distName[i]);
 945             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 946         }
 947
 948         //not lets reverse the order of ever other process, so we balance big files running with little ones
 949         for (int i = 0; i < processors; i++) {
 950             //cout << i << endl;
 951             int remainder = ((i+1) % processors);
 952             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 953         }
 954
 955         if (m->control_pressed) { return listFiles; }
 956
 957         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 958                 int process = 1;
 959                 processIDS.clear();
 960
 961                 //loop through and create all the processes you want
 962                 while (process != processors) {
 963                         int pid = fork();
 964
 965                         if (pid > 0) {
 966                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 967                                 process++;
 968                         }else if (pid == 0){
 969                                 set<string> labels;
 970                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 971
 972                                 //write out names to file
 973                                 string filename = toString(getpid()) + ".temp";
 974                                 ofstream out;
 975                                 m->openOutputFile(filename, out);
 976                                 out << tag << endl;
 977                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 978                                 out.close();
 979
 980                                 //print out labels
 981                                 ofstream outLabels;
 982                                 filename = toString(getpid()) + ".temp.labels";
 983                                 m->openOutputFile(filename, outLabels);
 984
 985                                 outLabels << cutoff << endl;
 986                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 987                                         outLabels << (*it) << endl;
 988                                 }
 989                                 outLabels.close();
 990
 991                                 exit(0);
 992                         }else {
 993                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 994                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 995                                 exit(0);
 996                         }
 997                 }
 998
 999         //do your part
1000         listFiles = cluster(dividedNames[0], labels);
1001
1002                 //force parent to wait until all the processes are done
1003                 for (int i=0;i< processIDS.size();i++) {
1004                         int temp = processIDS[i];
1005                         wait(&temp);
1006                 }
1007
1008         //get list of list file names from each process
1009         for(int i=0;i<processIDS.size();i++){
1010             string filename = toString(processIDS[i]) + ".temp";
1011             ifstream in;
1012             m->openInputFile(filename, in);
1013
1014             in >> tag; m->gobble(in);
1015
1016             while(!in.eof()) {
1017                 string tempName;
1018                 in >> tempName; m->gobble(in);
1019                 listFiles.push_back(tempName);
1020             }
1021             in.close();
1022             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1023
1024             //get labels
1025             filename = toString(processIDS[i]) + ".temp.labels";
1026             ifstream in2;
1027             m->openInputFile(filename, in2);
1028
1029             float tempCutoff;
1030             in2 >> tempCutoff; m->gobble(in2);
1031             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1032
1033             while(!in2.eof()) {
1034                 string tempName;
1035                 in2 >> tempName; m->gobble(in2);
1036                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1037             }
1038             in2.close();
1039             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1040         }
1041
1042
1043     #else
1044
1045         //////////////////////////////////////////////////////////////////////////////////////////////////////
1046                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1047                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1048                 //Taking advantage of shared memory to allow both threads to add labels.
1049                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1050                 /*
1051                 vector<clusterData*> pDataArray;
1052                 DWORD   dwThreadIdArray[processors-1];
1053                 HANDLE  hThreadArray[processors-1];
1054
1055                 //Create processor worker threads.
1056                 for( int i=1; i<processors; i++ ){
1057                         // Allocate memory for thread data.
1058                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1059                         pDataArray.push_back(tempCluster);
1060                         processIDS.push_back(i);
1061
1062                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1063                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1064                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1065
1066                 }
1067
1068         //do your part
1069         listFiles = cluster(dividedNames[0], labels);
1070
1071                 //Wait until all threads have terminated.
1072                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1073
1074                 //Close all thread handles and free memory allocations.
1075                 for(int i=0; i < pDataArray.size(); i++){
1076             //get tag
1077             tag = pDataArray[i]->tag;
1078             //get listfiles created
1079             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1080             //get labels
1081             set<string>::iterator it;
1082             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1083                         //check cutoff
1084             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1085                         CloseHandle(hThreadArray[i]);
1086                         delete pDataArray[i];
1087                 }
1088 */
1089         #endif
1090
1091         return listFiles;
1092
1093         }
1094         catch(exception& e) {
1095                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1096                 exit(1);
1097         }
1098 }
1099 //**********************************************************************************************************************
1100
1101 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1102         try {
1103
1104                 vector<string> listFileNames;
1105                 double smallestCutoff = cutoff;
1106
1107                 //cluster each distance file
1108                 for (int i = 0; i < distNames.size(); i++) {
1109
1110                         string thisNamefile = distNames[i].begin()->second;
1111                         string thisDistFile = distNames[i].begin()->first;
1112
1113                         string listFileName = "";
1114             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1115             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1116
1117                         if (m->control_pressed) { //clean up
1118                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1119                                 listFileNames.clear(); return listFileNames;
1120                         }
1121
1122             listFileNames.push_back(listFileName);
1123         }
1124
1125                 cutoff = smallestCutoff;
1126
1127                 return listFileNames;
1128
1129         }
1130         catch(exception& e) {
1131                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1132                 exit(1);
1133         }
1134
1135
1136 }
1137 //**********************************************************************************************************************
1138 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1139         try {
1140         string listFileName = "";
1141
1142         ListVector* list = NULL;
1143         ListVector oldList;
1144         RAbundVector* rabund = NULL;
1145
1146 #ifdef USE_MPI
1147         int pid;
1148         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1149
1150         //output your files too
1151         if (pid != 0) {
1152             cout << endl << "Reading " << thisDistFile << endl;
1153         }
1154 #endif
1155
1156         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1157
1158         //reads phylip file storing data in 2D vector, also fills list and rabund
1159         bool sim = false;
1160                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1161
1162         NameAssignment* nameMap = NULL;
1163         CountTable* ct = NULL;
1164         if(namefile != ""){
1165                         nameMap = new NameAssignment(thisNamefile);
1166                         nameMap->readMap();
1167             cluster->readPhylipFile(thisDistFile, nameMap);
1168                 }else if (countfile != "") {
1169             ct = new CountTable();
1170             ct->readTable(thisNamefile);
1171             cluster->readPhylipFile(thisDistFile, ct);
1172         }
1173         tag = cluster->getTag();
1174
1175                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1176             else { delete ct; } delete cluster; return 0; }
1177
1178                 list = cluster->getListVector();
1179                 rabund = cluster->getRAbundVector();
1180
1181                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1182                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1183         listFileName = fileroot+ tag + ".list";
1184
1185         ofstream listFile;
1186                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1187
1188                 float previousDist = 0.00000;
1189                 float rndPreviousDist = 0.00000;
1190                 oldList = *list;
1191
1192 #ifdef USE_MPI
1193         //output your files too
1194         if (pid != 0) {
1195             cout << endl << "Clustering " << thisDistFile << endl;
1196         }
1197 #endif
1198
1199         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1200
1201                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1202                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1203                 else { delete ct; } return listFileName;  }
1204
1205                         cluster->update(cutoff);
1206
1207                         float dist = cluster->getSmallDist();
1208                         float rndDist;
1209                         if (hard) {
1210                                 rndDist = m->ceilDist(dist, precision);
1211                         }else{
1212                                 rndDist = m->roundDist(dist, precision);
1213                         }
1214
1215             if(previousDist <= 0.0000 && dist != previousDist){
1216                 oldList.setLabel("unique");
1217                 oldList.print(listFile);
1218                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1219             }
1220             else if(rndDist != rndPreviousDist){
1221                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1222                 oldList.print(listFile);
1223                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1224             }
1225
1226
1227                         previousDist = dist;
1228                         rndPreviousDist = rndDist;
1229                         oldList = *list;
1230                 }
1231
1232                 if(previousDist <= 0.0000){
1233             oldList.setLabel("unique");
1234             oldList.print(listFile);
1235             if (labels.count("unique") == 0) { labels.insert("unique"); }
1236         }
1237         else if(rndPreviousDist<cutoff){
1238             oldList.setLabel(toString(rndPreviousDist,  length-1));
1239             oldList.print(listFile);
1240             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1241         }
1242
1243
1244                 listFile.close();
1245
1246                 delete cluster;  delete list; delete rabund;
1247         if(namefile != ""){     delete nameMap; }
1248         else { delete ct; }
1249
1250         m->mothurRemove(thisDistFile);
1251         m->mothurRemove(thisNamefile);
1252
1253         return listFileName;
1254
1255         }
1256         catch(exception& e) {
1257                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1258                 exit(1);
1259         }
1260 }
1261
1262 //**********************************************************************************************************************
1263 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1264         try {
1265         string listFileName = "";
1266
1267         Cluster* cluster = NULL;
1268         SparseDistanceMatrix* matrix = NULL;
1269         ListVector* list = NULL;
1270         ListVector oldList;
1271         RAbundVector* rabund = NULL;
1272
1273         if (m->control_pressed) { return listFileName; }
1274
1275 #ifdef USE_MPI
1276         int pid;
1277         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1278
1279         //output your files too
1280         if (pid != 0) {
1281             cout << endl << "Reading " << thisDistFile << endl;
1282         }
1283 #endif
1284
1285         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1286
1287         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1288         read->setCutoff(cutoff);
1289
1290         NameAssignment* nameMap = NULL;
1291         CountTable* ct = NULL;
1292                 if(namefile != ""){
1293                         nameMap = new NameAssignment(thisNamefile);
1294                         nameMap->readMap();
1295             read->read(nameMap);
1296                 }else if (countfile != "") {
1297             ct = new CountTable();
1298             ct->readTable(thisNamefile);
1299             read->read(ct);
1300         }else { read->read(nameMap); }
1301
1302                 list = read->getListVector();
1303         oldList = *list;
1304                 matrix = read->getDMatrix();
1305
1306                 if(countfile != "") {
1307             rabund = new RAbundVector();
1308             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1309             delete ct;
1310         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1311
1312         delete read;  read = NULL;
1313         if (namefile != "") { delete nameMap; nameMap = NULL; }
1314
1315
1316 #ifdef USE_MPI
1317         //output your files too
1318         if (pid != 0) {
1319             cout << endl << "Clustering " << thisDistFile << endl;
1320         }
1321 #endif
1322
1323         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1324
1325         //create cluster
1326         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1327         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1328         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1329         tag = cluster->getTag();
1330
1331         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1332         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1333
1334         ofstream listFile;
1335         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1336
1337         listFileName = fileroot+ tag + ".list";
1338
1339         float previousDist = 0.00000;
1340         float rndPreviousDist = 0.00000;
1341
1342         oldList = *list;
1343
1344         print_start = true;
1345         start = time(NULL);
1346         double saveCutoff = cutoff;
1347
1348         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1349
1350             if (m->control_pressed) { //clean up
1351                 delete matrix; delete list;     delete cluster; delete rabund;
1352                 listFile.close();
1353                 m->mothurRemove(listFileName);
1354                 return listFileName;
1355             }
1356
1357             cluster->update(saveCutoff);
1358
1359             float dist = matrix->getSmallDist();
1360             float rndDist;
1361             if (hard) {
1362                 rndDist = m->ceilDist(dist, precision);
1363             }else{
1364                 rndDist = m->roundDist(dist, precision);
1365             }
1366
1367             if(previousDist <= 0.0000 && dist != previousDist){
1368                 oldList.setLabel("unique");
1369                 oldList.print(listFile);
1370                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1371             }
1372             else if(rndDist != rndPreviousDist){
1373                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1374                 oldList.print(listFile);
1375                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1376             }
1377
1378             previousDist = dist;
1379             rndPreviousDist = rndDist;
1380             oldList = *list;
1381         }
1382
1383
1384         if(previousDist <= 0.0000){
1385             oldList.setLabel("unique");
1386             oldList.print(listFile);
1387             if (labels.count("unique") == 0) { labels.insert("unique"); }
1388         }
1389         else if(rndPreviousDist<cutoff){
1390             oldList.setLabel(toString(rndPreviousDist,  length-1));
1391             oldList.print(listFile);
1392             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1393         }
1394
1395         delete matrix; delete list;     delete cluster; delete rabund;
1396         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1397         listFile.close();
1398
1399         if (m->control_pressed) { //clean up
1400             m->mothurRemove(listFileName);
1401             return listFileName;
1402         }
1403
1404         m->mothurRemove(thisDistFile);
1405         m->mothurRemove(thisNamefile);
1406
1407         if (saveCutoff != cutoff) {
1408             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1409             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1410
1411             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1412         }
1413
1414         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1415
1416         return listFileName;
1417
1418         }
1419         catch(exception& e) {
1420                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1421                 exit(1);
1422         }
1423 }
1424 //**********************************************************************************************************************
1425
1426 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1427         try{
1428
1429 #ifdef USE_MPI
1430                 int pid;
1431                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1432
1433                 if (pid != 0) {
1434 #endif
1435
1436                 string thisOutputDir = outputDir;
1437                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1438         map<string, string> variables;
1439         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1440                 string outputFileName = getOutputFileName("column", variables);
1441                 m->mothurRemove(outputFileName);
1442
1443
1444                 for (int i = 0; i < distNames.size(); i++) {
1445                         if (m->control_pressed) {  return 0; }
1446
1447                         string thisDistFile = distNames[i].begin()->first;
1448
1449                         m->appendFiles(thisDistFile, outputFileName);
1450                 }
1451
1452                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1453
1454 #ifdef USE_MPI
1455                 }
1456 #endif
1457
1458                 return 0;
1459
1460
1461         }
1462         catch(exception& e) {
1463                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1464                 exit(1);
1465         }
1466 }
1467 //**********************************************************************************************************************
1468 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1469     try {
1470         rabund->setLabel(list->getLabel());
1471         for(int i = 0; i < list->getNumBins(); i++) {
1472             if (m->control_pressed) { break; }
1473             vector<string> binNames;
1474             string bin = list->get(i);
1475             m->splitAtComma(bin, binNames);
1476             int total = 0;
1477             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1478             rabund->push_back(total);
1479         }
1480         return 0;
1481     }
1482     catch(exception& e) {
1483                 m->errorOut(e, "ClusterCommand", "createRabund");
1484                 exit(1);
1485         }
1486
1487 }
1488 //**********************************************************************************************************************