clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
  20                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
  21                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
  22                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
  23                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
  24                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
  25                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
  26                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
  27                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
  28                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
  29                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
  30                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
  31         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
  32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
  33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
  34
  35                 vector<string> myArray;
  36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  37                 return myArray;
  38         }
  39         catch(exception& e) {
  40                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  41                 exit(1);
  42         }
  43 }
  44 //**********************************************************************************************************************
  45 string ClusterSplitCommand::getHelpString(){
  46         try {
  47                 string helpString = "";
  48                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
  49                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  50                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  51                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  52                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  53                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  55                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  56                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  57                 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
  58                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  59                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  60                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  61                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  62                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  63                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  64                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  65         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  66 #ifdef USE_MPI
  67                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  68 #endif
  69                 helpString += "The cluster.split command should be in the following format: \n";
  70                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  71                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  72                 return helpString;
  73         }
  74         catch(exception& e) {
  75                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  76                 exit(1);
  77         }
  78 }
  79 //**********************************************************************************************************************
  80 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){
  81         try {
  82         string outputFileName = "";
  83                 map<string, vector<string> >::iterator it;
  84
  85         //is this a type this command creates
  86         it = outputTypes.find(type);
  87         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
  88         else {
  89             if (type == "list") {  outputFileName =  "list"; }
  90             else if (type == "rabund") {  outputFileName =  "rabund"; }
  91             else if (type == "sabund") {  outputFileName =  "sabund"; }
  92             else if (type == "column") {  outputFileName =  "dist"; }
  93             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
  94         }
  95         return outputFileName;
  96         }
  97         catch(exception& e) {
  98                 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
  99                 exit(1);
 100         }
 101 }
 102 //**********************************************************************************************************************
 103 ClusterSplitCommand::ClusterSplitCommand(){
 104         try {
 105                 abort = true; calledHelp = true;
 106                 setParameters();
 107                 vector<string> tempOutNames;
 108                 outputTypes["list"] = tempOutNames;
 109                 outputTypes["rabund"] = tempOutNames;
 110                 outputTypes["sabund"] = tempOutNames;
 111                 outputTypes["column"] = tempOutNames;
 112         }
 113         catch(exception& e) {
 114                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 115                 exit(1);
 116         }
 117 }
 118 //**********************************************************************************************************************
 119 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 120 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 121         try{
 122                 abort = false; calledHelp = false;
 123                 format = "";
 124
 125                 //allow user to run help
 126                 if(option == "help") { help(); abort = true; calledHelp = true; }
 127                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 128
 129                 else {
 130                         vector<string> myArray = setParameters();
 131
 132                         OptionParser parser(option);
 133                         map<string,string> parameters = parser.getParameters();
 134
 135                         ValidParameters validParameter("cluster.split");
 136
 137                         //check to make sure all parameters are valid for command
 138                         map<string,string>::iterator it;
 139                         for (it = parameters.begin(); it != parameters.end(); it++) {
 140                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 141                                         abort = true;
 142                                 }
 143                         }
 144
 145                         //initialize outputTypes
 146                         vector<string> tempOutNames;
 147                         outputTypes["list"] = tempOutNames;
 148                         outputTypes["rabund"] = tempOutNames;
 149                         outputTypes["sabund"] = tempOutNames;
 150                         outputTypes["column"] = tempOutNames;
 151
 152                         //if the user changes the output directory command factory will send this info to us in the output parameter
 153                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 154
 155                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 156                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 157                         if (inputDir == "not found"){   inputDir = "";          }
 158                         else {
 159                                 string path;
 160                                 it = parameters.find("phylip");
 161                                 //user has given a template file
 162                                 if(it != parameters.end()){
 163                                         path = m->hasPath(it->second);
 164                                         //if the user has not given a path then, add inputdir. else leave path alone.
 165                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 166                                 }
 167
 168                                 it = parameters.find("column");
 169                                 //user has given a template file
 170                                 if(it != parameters.end()){
 171                                         path = m->hasPath(it->second);
 172                                         //if the user has not given a path then, add inputdir. else leave path alone.
 173                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 174                                 }
 175
 176                                 it = parameters.find("name");
 177                                 //user has given a template file
 178                                 if(it != parameters.end()){
 179                                         path = m->hasPath(it->second);
 180                                         //if the user has not given a path then, add inputdir. else leave path alone.
 181                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 182                                 }
 183
 184                                 it = parameters.find("taxonomy");
 185                                 //user has given a template file
 186                                 if(it != parameters.end()){
 187                                         path = m->hasPath(it->second);
 188                                         //if the user has not given a path then, add inputdir. else leave path alone.
 189                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 190                                 }
 191
 192                                 it = parameters.find("fasta");
 193                                 //user has given a template file
 194                                 if(it != parameters.end()){
 195                                         path = m->hasPath(it->second);
 196                                         //if the user has not given a path then, add inputdir. else leave path alone.
 197                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 198                                 }
 199                         }
 200
 201                         //check for required parameters
 202                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 203                         if (phylipfile == "not open") { abort = true; }
 204                         else if (phylipfile == "not found") { phylipfile = ""; }
 205                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 206
 207                         columnfile = validParameter.validFile(parameters, "column", true);
 208                         if (columnfile == "not open") { abort = true; }
 209                         else if (columnfile == "not found") { columnfile = ""; }
 210                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 211
 212                         namefile = validParameter.validFile(parameters, "name", true);
 213                         if (namefile == "not open") { abort = true; }
 214                         else if (namefile == "not found") { namefile = "";  }
 215                         else { m->setNameFile(namefile); }
 216
 217                         fastafile = validParameter.validFile(parameters, "fasta", true);
 218                         if (fastafile == "not open") { abort = true; }
 219                         else if (fastafile == "not found") { fastafile = ""; }
 220                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 221
 222                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 223                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 224                         else if (taxFile == "not found") { taxFile = ""; }
 225                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 226
 227                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 228                                 //is there are current file available for either of these?
 229                                 //give priority to column, then phylip, then fasta
 230                                 columnfile = m->getColumnFile();
 231                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 232                                 else {
 233                                         phylipfile = m->getPhylipFile();
 234                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 235                                         else {
 236                                                 fastafile = m->getFastaFile();
 237                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 238                                                 else {
 239                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 240                                                         abort = true;
 241                                                 }
 242                                         }
 243                                 }
 244                         }
 245                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 246
 247                         if (columnfile != "") {
 248                                 if (namefile == "") {
 249                                         namefile = m->getNameFile();
 250                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 251                                         else {
 252                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
 253                                                 abort = true;
 254                                         }
 255                                 }
 256                         }
 257
 258                         if (fastafile != "") {
 259                                 if (taxFile == "") {
 260                                         taxFile = m->getTaxonomyFile();
 261                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 262                                         else {
 263                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 264                                                 abort = true;
 265                                         }
 266                                 }
 267
 268                                 if (namefile == "") {
 269                                         namefile = m->getNameFile();
 270                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 271                                         else {
 272                                                 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 273                                                 abort = true;
 274                                         }
 275                                 }
 276                         }
 277
 278                         //check for optional parameter and set defaults
 279                         // ...at some point should added some additional type checking...
 280                         //get user cutoff and precision or use defaults
 281                         string temp;
 282                         temp = validParameter.validFile(parameters, "precision", false);
 283                         if (temp == "not found") { temp = "100"; }
 284                         //saves precision legnth for formatting below
 285                         length = temp.length();
 286                         m->mothurConvert(temp, precision);
 287
 288                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 289                         hard = m->isTrue(temp);
 290
 291                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 292                         large = m->isTrue(temp);
 293
 294                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 295                         m->setProcessors(temp);
 296                         m->mothurConvert(temp, processors);
 297
 298                         temp = validParameter.validFile(parameters, "splitmethod", false);
 299                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 300                                 if (temp == "not found")  { splitmethod = "distance"; }
 301                                 else {  splitmethod = temp; }
 302                         }
 303
 304             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 305                         classic = m->isTrue(temp);
 306
 307             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 308
 309                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 310                         m->mothurConvert(temp, cutoff);
 311                         cutoff += (5 / (precision * 10.0));
 312
 313                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 314                         m->mothurConvert(temp, taxLevelCutoff);
 315
 316                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 317
 318                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 319                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 320
 321                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 322                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 323
 324                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 325
 326                         showabund = validParameter.validFile(parameters, "showabund", false);
 327                         if (showabund == "not found") { showabund = "T"; }
 328
 329                         timing = validParameter.validFile(parameters, "timing", false);
 330                         if (timing == "not found") { timing = "F"; }
 331
 332                 }
 333         }
 334         catch(exception& e) {
 335                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 336                 exit(1);
 337         }
 338 }
 339
 340 //**********************************************************************************************************************
 341
 342 int ClusterSplitCommand::execute(){
 343         try {
 344
 345                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 346
 347                 time_t estart;
 348                 vector<string> listFileNames;
 349                 set<string> labels;
 350                 string singletonName = "";
 351                 double saveCutoff = cutoff;
 352
 353                 //****************** file prep work ******************************//
 354                 #ifdef USE_MPI
 355                         int pid;
 356                         int tag = 2001;
 357                         MPI_Status status;
 358                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 359                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 360
 361                         if (pid == 0) { //only process 0 converts and splits
 362
 363                 #endif
 364
 365                 //if user gave a phylip file convert to column file
 366                 if (format == "phylip") {
 367                         estart = time(NULL);
 368                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 369
 370                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 371
 372                         NameAssignment* nameMap = NULL;
 373                         convert->setFormat("phylip");
 374                         convert->read(nameMap);
 375
 376                         if (m->control_pressed) {  delete convert;  return 0;  }
 377
 378                         distfile = convert->getOutputFile();
 379
 380                         //if no names file given with phylip file, create it
 381                         ListVector* listToMakeNameFile =  convert->getListVector();
 382                         if (namefile == "") {  //you need to make a namefile for split matrix
 383                                 ofstream out;
 384                                 namefile = phylipfile + ".names";
 385                                 m->openOutputFile(namefile, out);
 386                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 387                                         string bin = listToMakeNameFile->get(i);
 388                                         out << bin << '\t' << bin << endl;
 389                                 }
 390                                 out.close();
 391                         }
 392                         delete listToMakeNameFile;
 393                         delete convert;
 394
 395                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 396                 }
 397                 if (m->control_pressed) { return 0; }
 398
 399                 estart = time(NULL);
 400                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 401
 402                 //split matrix into non-overlapping groups
 403                 SplitMatrix* split;
 404                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
 405                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
 406                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);     }
 407                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 408
 409                 split->split();
 410
 411                 if (m->control_pressed) { delete split; return 0; }
 412
 413                 singletonName = split->getSingletonNames();
 414                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 415                 delete split;
 416
 417                 //output a merged distance file
 418                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
 419
 420
 421                 if (m->control_pressed) { return 0; }
 422
 423                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 424                 estart = time(NULL);
 425
 426                 //****************** break up files between processes and cluster each file set ******************************//
 427         #ifdef USE_MPI
 428                         ////you are process 0 from above////
 429
 430                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 431                         dividedNames.resize(processors);
 432
 433                         //for each file group figure out which process will complete it
 434                         //want to divide the load intelligently so the big files are spread between processes
 435                         for (int i = 0; i < distName.size(); i++) {
 436                                 int processToAssign = (i+1) % processors;
 437                                 if (processToAssign == 0) { processToAssign = processors; }
 438
 439                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 440                         }
 441
 442                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 443                         for (int i = 0; i < processors; i++) {
 444                                 int remainder = ((i+1) % processors);
 445                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 446                         }
 447
 448
 449                         //send each child the list of files it needs to process
 450                         for(int i = 1; i < processors; i++) {
 451                                 //send number of file pairs
 452                                 int num = dividedNames[i].size();
 453                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 454
 455                                 for (int j = 0; j < num; j++) { //send filenames to process i
 456                                         char tempDistFileName[1024];
 457                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 458                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 459
 460                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 461                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 462
 463                                         char tempNameFileName[1024];
 464                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 465                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 466
 467                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 468                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 469                                 }
 470                         }
 471
 472                         //process your share
 473                         listFileNames = cluster(dividedNames[0], labels);
 474
 475                         //receive the other processes info
 476                         for(int i = 1; i < processors; i++) {
 477                                 int num = dividedNames[i].size();
 478
 479                                 double tempCutoff;
 480                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 481                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 482
 483                                 //send list filenames to root process
 484                                 for (int j = 0; j < num; j++) {
 485                                         int lengthList = 0;
 486                                         char tempListFileName[1024];
 487
 488                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 489                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 490
 491                                         string myListFileName = tempListFileName;
 492                                         myListFileName = myListFileName.substr(0, lengthList);
 493
 494                                         listFileNames.push_back(myListFileName);
 495                                 }
 496
 497                                 //send Labels to root process
 498                                 int numLabels = 0;
 499                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 500
 501                                 for (int j = 0; j < numLabels; j++) {
 502                                         int lengthLabel = 0;
 503                                         char tempLabel[100];
 504
 505                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 506                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 507
 508                                         string myLabel = tempLabel;
 509                                         myLabel = myLabel.substr(0, lengthLabel);
 510
 511                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 512                                 }
 513                         }
 514
 515                 }else { //you are a child process
 516                         vector < map<string, string> >  myNames;
 517
 518                         //recieve the files you need to process
 519                         //receive number of file pairs
 520                         int num = 0;
 521                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 522
 523                         myNames.resize(num);
 524
 525                         for (int j = 0; j < num; j++) { //receive filenames to process
 526                                 int lengthDist = 0;
 527                                 char tempDistFileName[1024];
 528
 529                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 530                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 531
 532                                 string myDistFileName = tempDistFileName;
 533                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 534
 535                                 int lengthName = 0;
 536                                 char tempNameFileName[1024];
 537
 538                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 539                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 540
 541                                 string myNameFileName = tempNameFileName;
 542                                 myNameFileName = myNameFileName.substr(0, lengthName);
 543
 544                                 //save file name
 545                                 myNames[j][myDistFileName] = myNameFileName;
 546                         }
 547
 548                         //process them
 549                         listFileNames = cluster(myNames, labels);
 550
 551                         //send cutoff
 552                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 553
 554                         //send list filenames to root process
 555                         for (int j = 0; j < num; j++) {
 556                                 char tempListFileName[1024];
 557                                 strcpy(tempListFileName, listFileNames[j].c_str());
 558                                 int lengthList = listFileNames[j].length();
 559
 560                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 561                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 562                         }
 563
 564                         //send Labels to root process
 565                         int numLabels = labels.size();
 566                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 567
 568                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 569                                 char tempLabel[100];
 570                                 strcpy(tempLabel, (*it).c_str());
 571                                 int lengthLabel = (*it).length();
 572
 573                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 574                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 575                         }
 576                 }
 577
 578                 //make everyone wait
 579                 MPI_Barrier(MPI_COMM_WORLD);
 580
 581         #else
 582                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 583                 //sanity check
 584                 if (processors > distName.size()) { processors = distName.size(); }
 585
 586                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 587                                 if(processors == 1){
 588                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 589                                 }else{
 590                                         listFileNames = createProcesses(distName, labels);
 591                 }
 592                 #else
 593                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 594                 #endif
 595         #endif
 596                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 597
 598                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 599
 600                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 601
 602                 //****************** merge list file and create rabund and sabund files ******************************//
 603                 estart = time(NULL);
 604                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 605
 606                 #ifdef USE_MPI
 607                         if (pid == 0) { //only process 0 merges
 608                 #endif
 609
 610                 ListVector* listSingle;
 611                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 612
 613                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 614
 615                 mergeLists(listFileNames, labelBins, listSingle);
 616
 617                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 618
 619                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 620
 621                 //set list file as new current listfile
 622                 string current = "";
 623                 itTypes = outputTypes.find("list");
 624                 if (itTypes != outputTypes.end()) {
 625                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 626                 }
 627
 628                 //set rabund file as new current rabundfile
 629                 itTypes = outputTypes.find("rabund");
 630                 if (itTypes != outputTypes.end()) {
 631                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 632                 }
 633
 634                 //set sabund file as new current sabundfile
 635                 itTypes = outputTypes.find("sabund");
 636                 if (itTypes != outputTypes.end()) {
 637                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 638                 }
 639
 640                 m->mothurOutEndLine();
 641                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 642                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 643                 m->mothurOutEndLine();
 644
 645                 #ifdef USE_MPI
 646                         } //only process 0 merges
 647
 648                         //make everyone wait
 649                         MPI_Barrier(MPI_COMM_WORLD);
 650                 #endif
 651
 652                 return 0;
 653         }
 654         catch(exception& e) {
 655                 m->errorOut(e, "ClusterSplitCommand", "execute");
 656                 exit(1);
 657         }
 658 }
 659 //**********************************************************************************************************************
 660 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 661         try {
 662
 663                 map<float, int> labelBin;
 664                 vector<float> orderFloat;
 665                 int numSingleBins;
 666
 667                 //read in singletons
 668                 if (singleton != "none") {
 669                         ifstream in;
 670                         m->openInputFile(singleton, in);
 671
 672                         string firstCol, secondCol;
 673                         listSingle = new ListVector();
 674                         while (!in.eof()) {
 675                                 in >> firstCol >> secondCol; m->gobble(in);
 676                                 listSingle->push_back(secondCol);
 677                         }
 678                         in.close();
 679                         m->mothurRemove(singleton);
 680
 681                         numSingleBins = listSingle->getNumBins();
 682                 }else{  listSingle = NULL; numSingleBins = 0;  }
 683
 684                 //go through users set and make them floats so we can sort them
 685                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 686                         float temp = -10.0;
 687
 688                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 689                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 690
 691                         if (temp <= cutoff) {
 692                                 orderFloat.push_back(temp);
 693                                 labelBin[temp] = numSingleBins; //initialize numbins
 694                         }
 695                 }
 696
 697                 //sort order
 698                 sort(orderFloat.begin(), orderFloat.end());
 699                 userLabels.clear();
 700
 701                 //get the list info from each file
 702                 for (int k = 0; k < listNames.size(); k++) {
 703
 704                         if (m->control_pressed) {
 705                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 706                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 707                                 return labelBin;
 708                         }
 709
 710                         InputData* input = new InputData(listNames[k], "list");
 711                         ListVector* list = input->getListVector();
 712                         string lastLabel = list->getLabel();
 713
 714                         string filledInList = listNames[k] + "filledInTemp";
 715                         ofstream outFilled;
 716                         m->openOutputFile(filledInList, outFilled);
 717
 718                         //for each label needed
 719                         for(int l = 0; l < orderFloat.size(); l++){
 720
 721                                 string thisLabel;
 722                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 723                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 724
 725                                 //this file has reached the end
 726                                 if (list == NULL) {
 727                                         list = input->getListVector(lastLabel, true);
 728                                 }else{  //do you have the distance, or do you need to fill in
 729
 730                                         float labelFloat;
 731                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 732                                         else { convert(list->getLabel(), labelFloat); }
 733
 734                                         //check for missing labels
 735                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 736                                                 //if its bigger get last label, otherwise keep it
 737                                                 delete list;
 738                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 739                                         }
 740                                         lastLabel = list->getLabel();
 741                                 }
 742
 743                                 //print to new file
 744                                 list->setLabel(thisLabel);
 745                                 list->print(outFilled);
 746
 747                                 //update labelBin
 748                                 labelBin[orderFloat[l]] += list->getNumBins();
 749
 750                                 delete list;
 751
 752                                 list = input->getListVector();
 753                         }
 754
 755                         if (list != NULL) { delete list; }
 756                         delete input;
 757
 758                         outFilled.close();
 759                         m->mothurRemove(listNames[k]);
 760                         rename(filledInList.c_str(), listNames[k].c_str());
 761                 }
 762
 763                 return labelBin;
 764         }
 765         catch(exception& e) {
 766                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 767                 exit(1);
 768         }
 769 }
 770 //**********************************************************************************************************************
 771 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 772         try {
 773                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 774                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 775
 776         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
 777         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
 778         string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list");
 779
 780                 m->openOutputFile(sabundFileName,       outSabund);
 781                 m->openOutputFile(rabundFileName,       outRabund);
 782                 m->openOutputFile(listFileName, outList);
 783
 784                 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 785                 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 786                 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 787                 map<float, int>::iterator itLabel;
 788
 789                 //for each label needed
 790                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 791
 792                         string thisLabel;
 793                         if (itLabel->first == -1) { thisLabel = "unique"; }
 794                         else { thisLabel = toString(itLabel->first,  length-1);  }
 795
 796                         outList << thisLabel << '\t' << itLabel->second << '\t';
 797
 798                         RAbundVector* rabund = new RAbundVector();
 799                         rabund->setLabel(thisLabel);
 800
 801                         //add in singletons
 802                         if (listSingle != NULL) {
 803                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 804                                         outList << listSingle->get(j) << '\t';
 805                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
 806                                 }
 807                         }
 808
 809                         //get the list info from each file
 810                         for (int k = 0; k < listNames.size(); k++) {
 811
 812                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
 813
 814                                 InputData* input = new InputData(listNames[k], "list");
 815                                 ListVector* list = input->getListVector(thisLabel);
 816
 817                                 //this file has reached the end
 818                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 819                                 else {
 820                                         for (int j = 0; j < list->getNumBins(); j++) {
 821                                                 outList << list->get(j) << '\t';
 822                                                 rabund->push_back(m->getNumNames(list->get(j)));
 823                                         }
 824                                         delete list;
 825                                 }
 826                                 delete input;
 827                         }
 828
 829                         SAbundVector sabund = rabund->getSAbundVector();
 830
 831                         sabund.print(outSabund);
 832                         rabund->print(outRabund);
 833                         outList << endl;
 834
 835                         delete rabund;
 836                 }
 837
 838                 outList.close();
 839                 outRabund.close();
 840                 outSabund.close();
 841
 842                 if (listSingle != NULL) { delete listSingle;  }
 843
 844                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 845
 846                 return 0;
 847         }
 848         catch(exception& e) {
 849                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 850                 exit(1);
 851         }
 852 }
 853
 854 //**********************************************************************************************************************
 855
 856 void ClusterSplitCommand::printData(ListVector* oldList){
 857         try {
 858                 string label = oldList->getLabel();
 859                 RAbundVector oldRAbund = oldList->getRAbundVector();
 860
 861                 oldRAbund.setLabel(label);
 862                 if (m->isTrue(showabund)) {
 863                         oldRAbund.getSAbundVector().print(cout);
 864                 }
 865                 oldRAbund.print(outRabund);
 866                 oldRAbund.getSAbundVector().print(outSabund);
 867
 868                 oldList->print(outList);
 869         }
 870         catch(exception& e) {
 871                 m->errorOut(e, "ClusterSplitCommand", "printData");
 872                 exit(1);
 873         }
 874 }
 875 //**********************************************************************************************************************
 876 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 877         try {
 878
 879         vector<string> listFiles;
 880         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 881         dividedNames.resize(processors);
 882
 883         //for each file group figure out which process will complete it
 884         //want to divide the load intelligently so the big files are spread between processes
 885         for (int i = 0; i < distName.size(); i++) {
 886             //cout << i << endl;
 887             int processToAssign = (i+1) % processors;
 888             if (processToAssign == 0) { processToAssign = processors; }
 889
 890             dividedNames[(processToAssign-1)].push_back(distName[i]);
 891             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 892         }
 893
 894         //not lets reverse the order of ever other process, so we balance big files running with little ones
 895         for (int i = 0; i < processors; i++) {
 896             //cout << i << endl;
 897             int remainder = ((i+1) % processors);
 898             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 899         }
 900
 901         if (m->control_pressed) { return listFiles; }
 902
 903         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 904                 int process = 1;
 905                 processIDS.clear();
 906
 907                 //loop through and create all the processes you want
 908                 while (process != processors) {
 909                         int pid = fork();
 910
 911                         if (pid > 0) {
 912                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 913                                 process++;
 914                         }else if (pid == 0){
 915                                 set<string> labels;
 916                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 917
 918                                 //write out names to file
 919                                 string filename = toString(getpid()) + ".temp";
 920                                 ofstream out;
 921                                 m->openOutputFile(filename, out);
 922                                 out << tag << endl;
 923                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 924                                 out.close();
 925
 926                                 //print out labels
 927                                 ofstream outLabels;
 928                                 filename = toString(getpid()) + ".temp.labels";
 929                                 m->openOutputFile(filename, outLabels);
 930
 931                                 outLabels << cutoff << endl;
 932                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 933                                         outLabels << (*it) << endl;
 934                                 }
 935                                 outLabels.close();
 936
 937                                 exit(0);
 938                         }else {
 939                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 940                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 941                                 exit(0);
 942                         }
 943                 }
 944
 945         //do your part
 946         listFiles = cluster(dividedNames[0], labels);
 947
 948                 //force parent to wait until all the processes are done
 949                 for (int i=0;i< processIDS.size();i++) {
 950                         int temp = processIDS[i];
 951                         wait(&temp);
 952                 }
 953
 954         //get list of list file names from each process
 955         for(int i=0;i<processIDS.size();i++){
 956             string filename = toString(processIDS[i]) + ".temp";
 957             ifstream in;
 958             m->openInputFile(filename, in);
 959
 960             in >> tag; m->gobble(in);
 961
 962             while(!in.eof()) {
 963                 string tempName;
 964                 in >> tempName; m->gobble(in);
 965                 listFiles.push_back(tempName);
 966             }
 967             in.close();
 968             m->mothurRemove((toString(processIDS[i]) + ".temp"));
 969
 970             //get labels
 971             filename = toString(processIDS[i]) + ".temp.labels";
 972             ifstream in2;
 973             m->openInputFile(filename, in2);
 974
 975             float tempCutoff;
 976             in2 >> tempCutoff; m->gobble(in2);
 977             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 978
 979             while(!in2.eof()) {
 980                 string tempName;
 981                 in2 >> tempName; m->gobble(in2);
 982                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
 983             }
 984             in2.close();
 985             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
 986         }
 987
 988
 989     #else
 990
 991         //////////////////////////////////////////////////////////////////////////////////////////////////////
 992                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
 993                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
 994                 //Taking advantage of shared memory to allow both threads to add labels.
 995                 //////////////////////////////////////////////////////////////////////////////////////////////////////
 996
 997                 vector<clusterData*> pDataArray;
 998                 DWORD   dwThreadIdArray[processors-1];
 999                 HANDLE  hThreadArray[processors-1];
1000
1001                 //Create processor worker threads.
1002                 for( int i=1; i<processors; i++ ){
1003                         // Allocate memory for thread data.
1004                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1005                         pDataArray.push_back(tempCluster);
1006                         processIDS.push_back(i);
1007
1008                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1009                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1010                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1011
1012                 }
1013
1014         //do your part
1015         listFiles = cluster(dividedNames[0], labels);
1016
1017                 //Wait until all threads have terminated.
1018                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1019
1020                 //Close all thread handles and free memory allocations.
1021                 for(int i=0; i < pDataArray.size(); i++){
1022             //get tag
1023             tag = pDataArray[i]->tag;
1024             //get listfiles created
1025             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1026             //get labels
1027             set<string>::iterator it;
1028             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1029                         //check cutoff
1030             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1031                         CloseHandle(hThreadArray[i]);
1032                         delete pDataArray[i];
1033                 }
1034
1035         #endif
1036
1037         return listFiles;
1038
1039         }
1040         catch(exception& e) {
1041                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1042                 exit(1);
1043         }
1044 }
1045 //**********************************************************************************************************************
1046
1047 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1048         try {
1049
1050                 vector<string> listFileNames;
1051                 double smallestCutoff = cutoff;
1052
1053                 //cluster each distance file
1054                 for (int i = 0; i < distNames.size(); i++) {
1055
1056                         string thisNamefile = distNames[i].begin()->second;
1057                         string thisDistFile = distNames[i].begin()->first;
1058
1059                         string listFileName = "";
1060             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1061             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1062
1063                         if (m->control_pressed) { //clean up
1064                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1065                                 listFileNames.clear(); return listFileNames;
1066                         }
1067
1068             listFileNames.push_back(listFileName);
1069         }
1070
1071                 cutoff = smallestCutoff;
1072
1073                 return listFileNames;
1074
1075         }
1076         catch(exception& e) {
1077                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1078                 exit(1);
1079         }
1080
1081
1082 }
1083 //**********************************************************************************************************************
1084 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1085         try {
1086         string listFileName = "";
1087
1088         ListVector* list = NULL;
1089         ListVector oldList;
1090         RAbundVector* rabund = NULL;
1091
1092 #ifdef USE_MPI
1093         int pid;
1094         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1095
1096         //output your files too
1097         if (pid != 0) {
1098             cout << endl << "Reading " << thisDistFile << endl;
1099         }
1100 #endif
1101
1102         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1103
1104         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1105         nameMap->readMap();
1106
1107                 //reads phylip file storing data in 2D vector, also fills list and rabund
1108         bool sim = false;
1109                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1110                 cluster->readPhylipFile(thisDistFile, nameMap);
1111                 tag = cluster->getTag();
1112
1113                 if (m->control_pressed) { delete cluster; return 0; }
1114
1115                 list = cluster->getListVector();
1116                 rabund = cluster->getRAbundVector();
1117
1118                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1119                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1120         listFileName = fileroot+ tag + ".list";
1121
1122         ofstream listFile;
1123                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1124
1125                 float previousDist = 0.00000;
1126                 float rndPreviousDist = 0.00000;
1127                 oldList = *list;
1128
1129 #ifdef USE_MPI
1130         //output your files too
1131         if (pid != 0) {
1132             cout << endl << "Clustering " << thisDistFile << endl;
1133         }
1134 #endif
1135
1136         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1137
1138                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1139                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  return listFileName;  }
1140
1141                         cluster->update(cutoff);
1142
1143                         float dist = cluster->getSmallDist();
1144                         float rndDist;
1145                         if (hard) {
1146                                 rndDist = m->ceilDist(dist, precision);
1147                         }else{
1148                                 rndDist = m->roundDist(dist, precision);
1149                         }
1150
1151             if(previousDist <= 0.0000 && dist != previousDist){
1152                 oldList.setLabel("unique");
1153                 oldList.print(listFile);
1154                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1155             }
1156             else if(rndDist != rndPreviousDist){
1157                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1158                 oldList.print(listFile);
1159                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1160             }
1161
1162
1163                         previousDist = dist;
1164                         rndPreviousDist = rndDist;
1165                         oldList = *list;
1166                 }
1167
1168                 if(previousDist <= 0.0000){
1169             oldList.setLabel("unique");
1170             oldList.print(listFile);
1171             if (labels.count("unique") == 0) { labels.insert("unique"); }
1172         }
1173         else if(rndPreviousDist<cutoff){
1174             oldList.setLabel(toString(rndPreviousDist,  length-1));
1175             oldList.print(listFile);
1176             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1177         }
1178
1179
1180                 listFile.close();
1181
1182                 delete cluster; delete nameMap; delete list; delete rabund;
1183
1184
1185         return listFileName;
1186
1187         }
1188         catch(exception& e) {
1189                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1190                 exit(1);
1191         }
1192 }
1193
1194 //**********************************************************************************************************************
1195 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1196         try {
1197         string listFileName = "";
1198
1199         Cluster* cluster = NULL;
1200         SparseMatrix* matrix = NULL;
1201         ListVector* list = NULL;
1202         ListVector oldList;
1203         RAbundVector* rabund = NULL;
1204
1205         if (m->control_pressed) { return listFileName; }
1206
1207 #ifdef USE_MPI
1208         int pid;
1209         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1210
1211         //output your files too
1212         if (pid != 0) {
1213             cout << endl << "Reading " << thisDistFile << endl;
1214         }
1215 #endif
1216
1217         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1218
1219         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1220         read->setCutoff(cutoff);
1221
1222         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1223         nameMap->readMap();
1224         read->read(nameMap);
1225
1226         if (m->control_pressed) {  delete read; delete nameMap; return listFileName; }
1227
1228         list = read->getListVector();
1229         oldList = *list;
1230         matrix = read->getMatrix();
1231
1232         delete read;  read = NULL;
1233         delete nameMap; nameMap = NULL;
1234
1235
1236 #ifdef USE_MPI
1237         //output your files too
1238         if (pid != 0) {
1239             cout << endl << "Clustering " << thisDistFile << endl;
1240         }
1241 #endif
1242
1243         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1244
1245         rabund = new RAbundVector(list->getRAbundVector());
1246
1247         //create cluster
1248         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1249         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1250         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1251         tag = cluster->getTag();
1252
1253         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1254         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1255
1256         ofstream listFile;
1257         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1258
1259         listFileName = fileroot+ tag + ".list";
1260
1261         float previousDist = 0.00000;
1262         float rndPreviousDist = 0.00000;
1263
1264         oldList = *list;
1265
1266         print_start = true;
1267         start = time(NULL);
1268         double saveCutoff = cutoff;
1269
1270         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1271
1272             if (m->control_pressed) { //clean up
1273                 delete matrix; delete list;     delete cluster; delete rabund;
1274                 listFile.close();
1275                 m->mothurRemove(listFileName);
1276                 return listFileName;
1277             }
1278
1279             cluster->update(saveCutoff);
1280
1281             float dist = matrix->getSmallDist();
1282             float rndDist;
1283             if (hard) {
1284                 rndDist = m->ceilDist(dist, precision);
1285             }else{
1286                 rndDist = m->roundDist(dist, precision);
1287             }
1288
1289             if(previousDist <= 0.0000 && dist != previousDist){
1290                 oldList.setLabel("unique");
1291                 oldList.print(listFile);
1292                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1293             }
1294             else if(rndDist != rndPreviousDist){
1295                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1296                 oldList.print(listFile);
1297                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1298             }
1299
1300             previousDist = dist;
1301             rndPreviousDist = rndDist;
1302             oldList = *list;
1303         }
1304
1305
1306         if(previousDist <= 0.0000){
1307             oldList.setLabel("unique");
1308             oldList.print(listFile);
1309             if (labels.count("unique") == 0) { labels.insert("unique"); }
1310         }
1311         else if(rndPreviousDist<cutoff){
1312             oldList.setLabel(toString(rndPreviousDist,  length-1));
1313             oldList.print(listFile);
1314             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1315         }
1316
1317         delete matrix; delete list;     delete cluster; delete rabund;
1318         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1319         listFile.close();
1320
1321         if (m->control_pressed) { //clean up
1322             m->mothurRemove(listFileName);
1323             return listFileName;
1324         }
1325
1326         m->mothurRemove(thisDistFile);
1327         m->mothurRemove(thisNamefile);
1328
1329         if (saveCutoff != cutoff) {
1330             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1331             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1332
1333             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1334         }
1335
1336         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1337
1338         return listFileName;
1339
1340         }
1341         catch(exception& e) {
1342                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1343                 exit(1);
1344         }
1345 }
1346 //**********************************************************************************************************************
1347
1348 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1349         try{
1350
1351 #ifdef USE_MPI
1352                 int pid;
1353                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1354
1355                 if (pid != 0) {
1356 #endif
1357
1358                 string thisOutputDir = outputDir;
1359                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1360                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1361                 m->mothurRemove(outputFileName);
1362
1363
1364                 for (int i = 0; i < distNames.size(); i++) {
1365                         if (m->control_pressed) {  return 0; }
1366
1367                         string thisDistFile = distNames[i].begin()->first;
1368
1369                         m->appendFiles(thisDistFile, outputFileName);
1370                 }
1371
1372                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1373
1374 #ifdef USE_MPI
1375                 }
1376 #endif
1377
1378                 return 0;
1379
1380
1381         }
1382         catch(exception& e) {
1383                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1384                 exit(1);
1385         }
1386 }
1387 //**********************************************************************************************************************