clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11 #include "readcluster.h"
  12 #include "splitmatrix.h"
  13 #include "readphylip.h"
  14 #include "readcolumn.h"
  15 #include "readmatrix.hpp"
  16 #include "inputdata.h"
  17
  18
  19 //**********************************************************************************************************************
  20 vector<string> ClusterSplitCommand::getValidParameters(){
  21         try {
  22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
  24                 return myArray;
  25         }
  26         catch(exception& e) {
  27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
  28                 exit(1);
  29         }
  30 }
  31 //**********************************************************************************************************************
  32 ClusterSplitCommand::ClusterSplitCommand(){
  33         try {
  34                 //initialize outputTypes
  35                 vector<string> tempOutNames;
  36                 outputTypes["list"] = tempOutNames;
  37                 outputTypes["rabund"] = tempOutNames;
  38                 outputTypes["sabund"] = tempOutNames;
  39         }
  40         catch(exception& e) {
  41                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
  42                 exit(1);
  43         }
  44 }
  45 //**********************************************************************************************************************
  46 vector<string> ClusterSplitCommand::getRequiredParameters(){
  47         try {
  48                 string Array[] =  {"fasta","phylip","column","or"};
  49                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  50                 return myArray;
  51         }
  52         catch(exception& e) {
  53                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
  54                 exit(1);
  55         }
  56 }
  57 //**********************************************************************************************************************
  58 vector<string> ClusterSplitCommand::getRequiredFiles(){
  59         try {
  60                 vector<string> myArray;
  61                 return myArray;
  62         }
  63         catch(exception& e) {
  64                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
  65                 exit(1);
  66         }
  67 }
  68 //**********************************************************************************************************************
  69 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  70 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  71         try{
  72                 globaldata = GlobalData::getInstance();
  73                 abort = false;
  74                 format = "";
  75
  76                 //allow user to run help
  77                 if(option == "help") { help(); abort = true; }
  78
  79                 else {
  80                         //valid paramters for this command
  81                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  82                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  83
  84                         OptionParser parser(option);
  85                         map<string,string> parameters = parser.getParameters();
  86
  87                         ValidParameters validParameter("cluster.split");
  88
  89                         //check to make sure all parameters are valid for command
  90                         map<string,string>::iterator it;
  91                         for (it = parameters.begin(); it != parameters.end(); it++) {
  92                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
  93                                         abort = true;
  94                                 }
  95                         }
  96
  97                         //initialize outputTypes
  98                         vector<string> tempOutNames;
  99                         outputTypes["list"] = tempOutNames;
 100                         outputTypes["rabund"] = tempOutNames;
 101                         outputTypes["sabund"] = tempOutNames;
 102
 103                         globaldata->newRead();
 104
 105                         //if the user changes the output directory command factory will send this info to us in the output parameter
 106                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 107
 108                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 110                         if (inputDir == "not found"){   inputDir = "";          }
 111                         else {
 112                                 string path;
 113                                 it = parameters.find("phylip");
 114                                 //user has given a template file
 115                                 if(it != parameters.end()){
 116                                         path = m->hasPath(it->second);
 117                                         //if the user has not given a path then, add inputdir. else leave path alone.
 118                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 119                                 }
 120
 121                                 it = parameters.find("column");
 122                                 //user has given a template file
 123                                 if(it != parameters.end()){
 124                                         path = m->hasPath(it->second);
 125                                         //if the user has not given a path then, add inputdir. else leave path alone.
 126                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 127                                 }
 128
 129                                 it = parameters.find("name");
 130                                 //user has given a template file
 131                                 if(it != parameters.end()){
 132                                         path = m->hasPath(it->second);
 133                                         //if the user has not given a path then, add inputdir. else leave path alone.
 134                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 135                                 }
 136
 137                                 it = parameters.find("taxonomy");
 138                                 //user has given a template file
 139                                 if(it != parameters.end()){
 140                                         path = m->hasPath(it->second);
 141                                         //if the user has not given a path then, add inputdir. else leave path alone.
 142                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 143                                 }
 144
 145                                 it = parameters.find("fasta");
 146                                 //user has given a template file
 147                                 if(it != parameters.end()){
 148                                         path = m->hasPath(it->second);
 149                                         //if the user has not given a path then, add inputdir. else leave path alone.
 150                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 151                                 }
 152                         }
 153
 154                         //check for required parameters
 155                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 156                         if (phylipfile == "not open") { abort = true; }
 157                         else if (phylipfile == "not found") { phylipfile = ""; }
 158                         else {  distfile = phylipfile;  format = "phylip";      }
 159
 160                         columnfile = validParameter.validFile(parameters, "column", true);
 161                         if (columnfile == "not open") { abort = true; }
 162                         else if (columnfile == "not found") { columnfile = ""; }
 163                         else {  distfile = columnfile; format = "column";       }
 164
 165                         namefile = validParameter.validFile(parameters, "name", true);
 166                         if (namefile == "not open") { abort = true; }
 167                         else if (namefile == "not found") { namefile = ""; }
 168
 169                         fastafile = validParameter.validFile(parameters, "fasta", true);
 170                         if (fastafile == "not open") { abort = true; }
 171                         else if (fastafile == "not found") { fastafile = ""; }
 172                         else { distfile = fastafile;  splitmethod = "fasta";  }
 173
 174                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 175                         if (taxFile == "not open") { abort = true; }
 176                         else if (taxFile == "not found") { taxFile = ""; }
 177
 178                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
 179                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 180
 181                         if (columnfile != "") {
 182                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
 183                         }
 184
 185                         if (fastafile != "") {
 186                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 187                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 188                         }
 189
 190                         //check for optional parameter and set defaults
 191                         // ...at some point should added some additional type checking...
 192                         //get user cutoff and precision or use defaults
 193                         string temp;
 194                         temp = validParameter.validFile(parameters, "precision", false);
 195                         if (temp == "not found") { temp = "100"; }
 196                         //saves precision legnth for formatting below
 197                         length = temp.length();
 198                         convert(temp, precision);
 199
 200                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
 201                         hard = m->isTrue(temp);
 202
 203                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 204                         large = m->isTrue(temp);
 205
 206                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 207                         convert(temp, processors);
 208
 209                         temp = validParameter.validFile(parameters, "splitmethod", false);
 210                         if (splitmethod != "fasta") {
 211                                 if (temp == "not found")  { splitmethod = "distance"; }
 212                                 else {  splitmethod = temp; }
 213                         }
 214
 215                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
 216                         convert(temp, cutoff);
 217                         cutoff += (5 / (precision * 10.0));
 218
 219                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
 220                         convert(temp, taxLevelCutoff);
 221
 222                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
 223
 224                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 225                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 226
 227                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 228                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 229
 230                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 231
 232                         showabund = validParameter.validFile(parameters, "showabund", false);
 233                         if (showabund == "not found") { showabund = "T"; }
 234
 235                         timing = validParameter.validFile(parameters, "timing", false);
 236                         if (timing == "not found") { timing = "F"; }
 237
 238                 }
 239         }
 240         catch(exception& e) {
 241                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 242                 exit(1);
 243         }
 244 }
 245
 246 //**********************************************************************************************************************
 247
 248 void ClusterSplitCommand::help(){
 249         try {
 250                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
 251                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
 252                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
 253                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
 254                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
 255                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
 256                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
 257                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 258                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
 259                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 260                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
 261                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
 262                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
 263                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
 264                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 265                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
 266                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 267                 #ifdef USE_MPI
 268                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
 269                 #endif
 270                 m->mothurOut("The cluster.split command should be in the following format: \n");
 271                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 272                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
 273
 274         }
 275         catch(exception& e) {
 276                 m->errorOut(e, "ClusterSplitCommand", "help");
 277                 exit(1);
 278         }
 279 }
 280
 281 //**********************************************************************************************************************
 282
 283 ClusterSplitCommand::~ClusterSplitCommand(){}
 284
 285 //**********************************************************************************************************************
 286
 287 int ClusterSplitCommand::execute(){
 288         try {
 289
 290                 if (abort == true) {    return 0;       }
 291
 292                 time_t estart;
 293                 vector<string> listFileNames;
 294                 set<string> labels;
 295                 string singletonName = "";
 296                 double saveCutoff = cutoff;
 297
 298                 //****************** file prep work ******************************//
 299                 #ifdef USE_MPI
 300                         int pid;
 301                         int tag = 2001;
 302                         MPI_Status status;
 303                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 304                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 305
 306                         if (pid == 0) { //only process 0 converts and splits
 307
 308                 #endif
 309
 310                 //if user gave a phylip file convert to column file
 311                 if (format == "phylip") {
 312                         estart = time(NULL);
 313                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 314
 315                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 316
 317                         NameAssignment* nameMap = NULL;
 318                         convert->setFormat("phylip");
 319                         convert->read(nameMap);
 320
 321                         if (m->control_pressed) {  delete convert;  return 0;  }
 322
 323                         distfile = convert->getOutputFile();
 324
 325                         //if no names file given with phylip file, create it
 326                         ListVector* listToMakeNameFile =  convert->getListVector();
 327                         if (namefile == "") {  //you need to make a namefile for split matrix
 328                                 ofstream out;
 329                                 namefile = phylipfile + ".names";
 330                                 m->openOutputFile(namefile, out);
 331                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 332                                         string bin = listToMakeNameFile->get(i);
 333                                         out << bin << '\t' << bin << endl;
 334                                 }
 335                                 out.close();
 336                         }
 337                         delete listToMakeNameFile;
 338                         delete convert;
 339
 340                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 341                 }
 342                 if (m->control_pressed) { return 0; }
 343
 344                 estart = time(NULL);
 345                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 346
 347                 //split matrix into non-overlapping groups
 348                 SplitMatrix* split;
 349                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
 350                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
 351                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
 352                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 353
 354                 split->split();
 355
 356                 if (m->control_pressed) { delete split; return 0; }
 357
 358                 singletonName = split->getSingletonNames();
 359                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 360                 delete split;
 361
 362                 if (m->control_pressed) { return 0; }
 363
 364                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 365                 estart = time(NULL);
 366
 367                 //****************** break up files between processes and cluster each file set ******************************//
 368         #ifdef USE_MPI
 369                         ////you are process 0 from above////
 370
 371                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 372                         dividedNames.resize(processors);
 373
 374                         //for each file group figure out which process will complete it
 375                         //want to divide the load intelligently so the big files are spread between processes
 376                         int count = 1;
 377                         for (int i = 0; i < distName.size(); i++) {
 378                                 int processToAssign = (i+1) % processors;
 379                                 if (processToAssign == 0) { processToAssign = processors; }
 380
 381                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 382                         }
 383
 384                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 385                         for (int i = 0; i < processors; i++) {
 386                                 int remainder = ((i+1) % processors);
 387                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 388                         }
 389
 390
 391                         //send each child the list of files it needs to process
 392                         for(int i = 1; i < processors; i++) {
 393                                 //send number of file pairs
 394                                 int num = dividedNames[i].size();
 395                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 396
 397                                 for (int j = 0; j < num; j++) { //send filenames to process i
 398                                         char tempDistFileName[1024];
 399                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 400                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 401
 402                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 403                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 404
 405                                         char tempNameFileName[1024];
 406                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 407                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 408
 409                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 410                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 411                                 }
 412                         }
 413
 414                         //process your share
 415                         listFileNames = cluster(dividedNames[0], labels);
 416
 417                         //receive the other processes info
 418                         for(int i = 1; i < processors; i++) {
 419                                 int num = dividedNames[i].size();
 420
 421                                 double tempCutoff;
 422                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 423                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 424
 425                                 //send list filenames to root process
 426                                 for (int j = 0; j < num; j++) {
 427                                         int lengthList = 0;
 428                                         char tempListFileName[1024];
 429
 430                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 431                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 432
 433                                         string myListFileName = tempListFileName;
 434                                         myListFileName = myListFileName.substr(0, lengthList);
 435
 436                                         listFileNames.push_back(myListFileName);
 437                                 }
 438
 439                                 //send Labels to root process
 440                                 int numLabels = 0;
 441                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 442
 443                                 for (int j = 0; j < numLabels; j++) {
 444                                         int lengthLabel = 0;
 445                                         char tempLabel[100];
 446
 447                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 448                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 449
 450                                         string myLabel = tempLabel;
 451                                         myLabel = myLabel.substr(0, lengthLabel);
 452
 453                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 454                                 }
 455                         }
 456
 457                 }else { //you are a child process
 458                         vector < map<string, string> >  myNames;
 459
 460                         //recieve the files you need to process
 461                         //receive number of file pairs
 462                         int num = 0;
 463                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 464
 465                         myNames.resize(num);
 466
 467                         for (int j = 0; j < num; j++) { //receive filenames to process
 468                                 int lengthDist = 0;
 469                                 char tempDistFileName[1024];
 470
 471                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 472                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 473
 474                                 string myDistFileName = tempDistFileName;
 475                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 476
 477                                 int lengthName = 0;
 478                                 char tempNameFileName[1024];
 479
 480                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 481                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 482
 483                                 string myNameFileName = tempNameFileName;
 484                                 myNameFileName = myNameFileName.substr(0, lengthName);
 485
 486                                 //save file name
 487                                 myNames[j][myDistFileName] = myNameFileName;
 488                         }
 489
 490                         //process them
 491                         listFileNames = cluster(myNames, labels);
 492
 493                         //send cutoff
 494                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 495
 496                         //send list filenames to root process
 497                         for (int j = 0; j < num; j++) {
 498                                 char tempListFileName[1024];
 499                                 strcpy(tempListFileName, listFileNames[j].c_str());
 500                                 int lengthList = listFileNames[j].length();
 501
 502                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 503                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 504                         }
 505
 506                         //send Labels to root process
 507                         int numLabels = labels.size();
 508                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 509
 510                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 511                                 char tempLabel[100];
 512                                 strcpy(tempLabel, (*it).c_str());
 513                                 int lengthLabel = (*it).length();
 514
 515                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 516                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 517                         }
 518                 }
 519
 520                 //make everyone wait
 521                 MPI_Barrier(MPI_COMM_WORLD);
 522
 523         #else
 524
 525                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 526                                 if(processors == 1){
 527                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 528                                 }else{
 529                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 530                                         dividedNames.resize(processors);
 531
 532                                         //for each file group figure out which process will complete it
 533                                         //want to divide the load intelligently so the big files are spread between processes
 534                                         int count = 1;
 535                                         for (int i = 0; i < distName.size(); i++) {
 536                                                 int processToAssign = (i+1) % processors;
 537                                                 if (processToAssign == 0) { processToAssign = processors; }
 538
 539                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 540                                         }
 541
 542                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 543                                         for (int i = 0; i < processors; i++) {
 544                                                 int remainder = ((i+1) % processors);
 545                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 546                                         }
 547
 548                                         createProcesses(dividedNames);
 549
 550                                         if (m->control_pressed) { return 0; }
 551
 552                                         //get list of list file names from each process
 553                                         for(int i=0;i<processors;i++){
 554                                                 string filename = toString(processIDS[i]) + ".temp";
 555                                                 ifstream in;
 556                                                 m->openInputFile(filename, in);
 557
 558                                                 in >> tag; m->gobble(in);
 559
 560                                                 while(!in.eof()) {
 561                                                         string tempName;
 562                                                         in >> tempName; m->gobble(in);
 563                                                         listFileNames.push_back(tempName);
 564                                                 }
 565                                                 in.close();
 566                                                 remove((toString(processIDS[i]) + ".temp").c_str());
 567
 568                                                 //get labels
 569                                                 filename = toString(processIDS[i]) + ".temp.labels";
 570                                                 ifstream in2;
 571                                                 m->openInputFile(filename, in2);
 572
 573                                                 float tempCutoff;
 574                                                 in2 >> tempCutoff; m->gobble(in2);
 575                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 576
 577                                                 while(!in2.eof()) {
 578                                                         string tempName;
 579                                                         in2 >> tempName; m->gobble(in2);
 580                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
 581                                                 }
 582                                                 in2.close();
 583                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
 584                                         }
 585                                 }
 586                 #else
 587                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 588                 #endif
 589         #endif
 590                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 591
 592                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 593
 594                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 595
 596                 //****************** merge list file and create rabund and sabund files ******************************//
 597                 estart = time(NULL);
 598                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 599
 600                 #ifdef USE_MPI
 601                         if (pid == 0) { //only process 0 merges
 602                 #endif
 603
 604                 ListVector* listSingle;
 605                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 606
 607                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 608
 609                 mergeLists(listFileNames, labelBins, listSingle);
 610
 611                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 612
 613                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 614
 615                 m->mothurOutEndLine();
 616                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 617                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 618                 m->mothurOutEndLine();
 619
 620                 #ifdef USE_MPI
 621                         } //only process 0 merges
 622
 623                         //make everyone wait
 624                         MPI_Barrier(MPI_COMM_WORLD);
 625                 #endif
 626
 627                 return 0;
 628         }
 629         catch(exception& e) {
 630                 m->errorOut(e, "ClusterSplitCommand", "execute");
 631                 exit(1);
 632         }
 633 }
 634 //**********************************************************************************************************************
 635 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 636         try {
 637
 638                 map<float, int> labelBin;
 639                 vector<float> orderFloat;
 640                 int numSingleBins;
 641
 642                 //read in singletons
 643                 if (singleton != "none") {
 644                         ifstream in;
 645                         m->openInputFile(singleton, in);
 646
 647                         string firstCol, secondCol;
 648                         listSingle = new ListVector();
 649                         while (!in.eof()) {
 650                                 in >> firstCol >> secondCol; m->gobble(in);
 651                                 listSingle->push_back(secondCol);
 652                         }
 653                         in.close();
 654                         remove(singleton.c_str());
 655
 656                         numSingleBins = listSingle->getNumBins();
 657                 }else{  listSingle = NULL; numSingleBins = 0;  }
 658
 659                 //go through users set and make them floats so we can sort them
 660                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 661                         float temp = -10.0;
 662
 663                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 664                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 665
 666                         if (temp <= cutoff) {
 667                                 orderFloat.push_back(temp);
 668                                 labelBin[temp] = numSingleBins; //initialize numbins
 669                         }
 670                 }
 671
 672                 //sort order
 673                 sort(orderFloat.begin(), orderFloat.end());
 674                 userLabels.clear();
 675
 676                 //get the list info from each file
 677                 for (int k = 0; k < listNames.size(); k++) {
 678
 679                         if (m->control_pressed) {
 680                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
 681                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
 682                                 return labelBin;
 683                         }
 684
 685                         InputData* input = new InputData(listNames[k], "list");
 686                         ListVector* list = input->getListVector();
 687                         string lastLabel = list->getLabel();
 688
 689                         string filledInList = listNames[k] + "filledInTemp";
 690                         ofstream outFilled;
 691                         m->openOutputFile(filledInList, outFilled);
 692
 693                         //for each label needed
 694                         for(int l = 0; l < orderFloat.size(); l++){
 695
 696                                 string thisLabel;
 697                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 698                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 699
 700                                 //this file has reached the end
 701                                 if (list == NULL) {
 702                                         list = input->getListVector(lastLabel, true);
 703                                 }else{  //do you have the distance, or do you need to fill in
 704
 705                                         float labelFloat;
 706                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 707                                         else { convert(list->getLabel(), labelFloat); }
 708
 709                                         //check for missing labels
 710                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 711                                                 //if its bigger get last label, otherwise keep it
 712                                                 delete list;
 713                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 714                                         }
 715                                         lastLabel = list->getLabel();
 716                                 }
 717
 718                                 //print to new file
 719                                 list->setLabel(thisLabel);
 720                                 list->print(outFilled);
 721
 722                                 //update labelBin
 723                                 labelBin[orderFloat[l]] += list->getNumBins();
 724
 725                                 delete list;
 726
 727                                 list = input->getListVector();
 728                         }
 729
 730                         if (list != NULL) { delete list; }
 731                         delete input;
 732
 733                         outFilled.close();
 734                         remove(listNames[k].c_str());
 735                         rename(filledInList.c_str(), listNames[k].c_str());
 736                 }
 737
 738                 return labelBin;
 739         }
 740         catch(exception& e) {
 741                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 742                 exit(1);
 743         }
 744 }
 745 //**********************************************************************************************************************
 746 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 747         try {
 748                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 749                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 750
 751                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
 752                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
 753                 m->openOutputFile(fileroot+ tag + ".list",              outList);
 754
 755                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
 756                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
 757                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
 758
 759                 map<float, int>::iterator itLabel;
 760
 761                 //for each label needed
 762                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 763
 764                         string thisLabel;
 765                         if (itLabel->first == -1) { thisLabel = "unique"; }
 766                         else { thisLabel = toString(itLabel->first,  length-1);  }
 767
 768                         outList << thisLabel << '\t' << itLabel->second << '\t';
 769
 770                         RAbundVector* rabund = new RAbundVector();
 771                         rabund->setLabel(thisLabel);
 772
 773                         //add in singletons
 774                         if (listSingle != NULL) {
 775                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 776                                         outList << listSingle->get(j) << '\t';
 777                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
 778                                 }
 779                         }
 780
 781                         //get the list info from each file
 782                         for (int k = 0; k < listNames.size(); k++) {
 783
 784                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
 785
 786                                 InputData* input = new InputData(listNames[k], "list");
 787                                 ListVector* list = input->getListVector(thisLabel);
 788
 789                                 //this file has reached the end
 790                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 791                                 else {
 792                                         for (int j = 0; j < list->getNumBins(); j++) {
 793                                                 outList << list->get(j) << '\t';
 794                                                 rabund->push_back(m->getNumNames(list->get(j)));
 795                                         }
 796                                         delete list;
 797                                 }
 798                                 delete input;
 799                         }
 800
 801                         SAbundVector sabund = rabund->getSAbundVector();
 802
 803                         sabund.print(outSabund);
 804                         rabund->print(outRabund);
 805                         outList << endl;
 806
 807                         delete rabund;
 808                 }
 809
 810                 outList.close();
 811                 outRabund.close();
 812                 outSabund.close();
 813
 814                 if (listSingle != NULL) { delete listSingle;  }
 815
 816                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
 817
 818                 return 0;
 819         }
 820         catch(exception& e) {
 821                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 822                 exit(1);
 823         }
 824 }
 825
 826 //**********************************************************************************************************************
 827
 828 void ClusterSplitCommand::printData(ListVector* oldList){
 829         try {
 830                 string label = oldList->getLabel();
 831                 RAbundVector oldRAbund = oldList->getRAbundVector();
 832
 833                 oldRAbund.setLabel(label);
 834                 if (m->isTrue(showabund)) {
 835                         oldRAbund.getSAbundVector().print(cout);
 836                 }
 837                 oldRAbund.print(outRabund);
 838                 oldRAbund.getSAbundVector().print(outSabund);
 839
 840                 oldList->print(outList);
 841         }
 842         catch(exception& e) {
 843                 m->errorOut(e, "ClusterSplitCommand", "printData");
 844                 exit(1);
 845         }
 846 }
 847 //**********************************************************************************************************************
 848 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
 849         try {
 850
 851         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 852                 int process = 0;
 853                 int exitCommand = 1;
 854                 processIDS.clear();
 855
 856                 //loop through and create all the processes you want
 857                 while (process != processors) {
 858                         int pid = fork();
 859
 860                         if (pid > 0) {
 861                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 862                                 process++;
 863                         }else if (pid == 0){
 864                                 set<string> labels;
 865                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 866
 867                                 //write out names to file
 868                                 string filename = toString(getpid()) + ".temp";
 869                                 ofstream out;
 870                                 m->openOutputFile(filename, out);
 871                                 out << tag << endl;
 872                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 873                                 out.close();
 874
 875                                 //print out labels
 876                                 ofstream outLabels;
 877                                 filename = toString(getpid()) + ".temp.labels";
 878                                 m->openOutputFile(filename, outLabels);
 879
 880                                 outLabels << cutoff << endl;
 881                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 882                                         outLabels << (*it) << endl;
 883                                 }
 884                                 outLabels.close();
 885
 886                                 exit(0);
 887                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
 888                 }
 889
 890                 //force parent to wait until all the processes are done
 891                 for (int i=0;i<processors;i++) {
 892                         int temp = processIDS[i];
 893                         wait(&temp);
 894                 }
 895
 896                 return exitCommand;
 897         #endif
 898
 899         }
 900         catch(exception& e) {
 901                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
 902                 exit(1);
 903         }
 904 }
 905 //**********************************************************************************************************************
 906
 907 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 908         try {
 909                 Cluster* cluster;
 910                 SparseMatrix* matrix;
 911                 ListVector* list;
 912                 ListVector oldList;
 913                 RAbundVector* rabund;
 914
 915                 vector<string> listFileNames;
 916
 917                 double smallestCutoff = cutoff;
 918
 919                 //cluster each distance file
 920                 for (int i = 0; i < distNames.size(); i++) {
 921                         if (m->control_pressed) { return listFileNames; }
 922
 923                         string thisNamefile = distNames[i].begin()->second;
 924                         string thisDistFile = distNames[i].begin()->first;
 925
 926                         //read in distance file
 927                         globaldata->setNameFile(thisNamefile);
 928                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 929
 930                         #ifdef USE_MPI
 931                                 int pid;
 932                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 933
 934                                 //output your files too
 935                                 if (pid != 0) {
 936                                         cout << endl << "Reading " << thisDistFile << endl;
 937                                 }
 938                         #endif
 939
 940                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
 941
 942                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
 943                         read->setCutoff(cutoff);
 944
 945                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
 946                         nameMap->readMap();
 947                         read->read(nameMap);
 948
 949                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
 950
 951                         list = read->getListVector();
 952                         oldList = *list;
 953                         matrix = read->getMatrix();
 954
 955                         delete read;
 956                         delete nameMap;
 957
 958
 959                         #ifdef USE_MPI
 960                                 //output your files too
 961                                 if (pid != 0) {
 962                                         cout << endl << "Clustering " << thisDistFile << endl;
 963                                 }
 964                         #endif
 965
 966                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 967
 968                         rabund = new RAbundVector(list->getRAbundVector());
 969
 970                         //create cluster
 971                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
 972                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
 973                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
 974                         tag = cluster->getTag();
 975
 976                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
 977                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
 978
 979                         ofstream listFile;
 980                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
 981
 982                         listFileNames.push_back(fileroot+ tag + ".list");
 983
 984                         time_t estart = time(NULL);
 985
 986                         float previousDist = 0.00000;
 987                         float rndPreviousDist = 0.00000;
 988
 989                         oldList = *list;
 990
 991                         print_start = true;
 992                         start = time(NULL);
 993                         double saveCutoff = cutoff;
 994
 995                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
 996
 997                                 if (m->control_pressed) { //clean up
 998                                         delete matrix; delete list;     delete cluster; delete rabund;
 999                                         listFile.close();
1000                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1001                                         listFileNames.clear(); return listFileNames;
1002                                 }
1003
1004                                 cluster->update(saveCutoff);
1005
1006                                 float dist = matrix->getSmallDist();
1007                                 float rndDist;
1008                                 if (hard) {
1009                                         rndDist = m->ceilDist(dist, precision);
1010                                 }else{
1011                                         rndDist = m->roundDist(dist, precision);
1012                                 }
1013
1014                                 if(previousDist <= 0.0000 && dist != previousDist){
1015                                         oldList.setLabel("unique");
1016                                         oldList.print(listFile);
1017                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1018                                 }
1019                                 else if(rndDist != rndPreviousDist){
1020                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1021                                         oldList.print(listFile);
1022                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1023                                 }
1024
1025                                 previousDist = dist;
1026                                 rndPreviousDist = rndDist;
1027                                 oldList = *list;
1028                         }
1029
1030
1031                         if(previousDist <= 0.0000){
1032                                 oldList.setLabel("unique");
1033                                 oldList.print(listFile);
1034                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1035                         }
1036                         else if(rndPreviousDist<cutoff){
1037                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1038                                 oldList.print(listFile);
1039                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1040                         }
1041
1042                         delete matrix; delete list;     delete cluster; delete rabund;
1043                         listFile.close();
1044
1045                         if (m->control_pressed) { //clean up
1046                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1047                                 listFileNames.clear(); return listFileNames;
1048                         }
1049
1050                         remove(thisDistFile.c_str());
1051                         remove(thisNamefile.c_str());
1052
1053                         if (saveCutoff != cutoff) {
1054                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1055                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1056
1057                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1058                         }
1059
1060                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1061                 }
1062
1063                 cutoff = smallestCutoff;
1064
1065                 return listFileNames;
1066
1067         }
1068         catch(exception& e) {
1069                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1070                 exit(1);
1071         }
1072
1073
1074 }
1075
1076 //**********************************************************************************************************************