clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11 #include "readcluster.h"
  12 #include "splitmatrix.h"
  13 #include "readphylip.h"
  14 #include "readcolumn.h"
  15 #include "readmatrix.hpp"
  16 #include "inputdata.h"
  17
  18 //**********************************************************************************************************************
  19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  21         try{
  22                 globaldata = GlobalData::getInstance();
  23                 abort = false;
  24                 format = "";
  25
  26                 //allow user to run help
  27                 if(option == "help") { help(); abort = true; }
  28
  29                 else {
  30                         //valid paramters for this command
  31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  33
  34                         OptionParser parser(option);
  35                         map<string,string> parameters = parser.getParameters();
  36
  37                         ValidParameters validParameter;
  38
  39                         //check to make sure all parameters are valid for command
  40                         map<string,string>::iterator it;
  41                         for (it = parameters.begin(); it != parameters.end(); it++) {
  42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
  43                                         abort = true;
  44                                 }
  45                         }
  46
  47                         globaldata->newRead();
  48
  49                         //if the user changes the output directory command factory will send this info to us in the output parameter
  50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
  51
  52                                 //if the user changes the input directory command factory will send this info to us in the output parameter
  53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
  54                         if (inputDir == "not found"){   inputDir = "";          }
  55                         else {
  56                                 string path;
  57                                 it = parameters.find("phylip");
  58                                 //user has given a template file
  59                                 if(it != parameters.end()){
  60                                         path = hasPath(it->second);
  61                                         //if the user has not given a path then, add inputdir. else leave path alone.
  62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
  63                                 }
  64
  65                                 it = parameters.find("column");
  66                                 //user has given a template file
  67                                 if(it != parameters.end()){
  68                                         path = hasPath(it->second);
  69                                         //if the user has not given a path then, add inputdir. else leave path alone.
  70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
  71                                 }
  72
  73                                 it = parameters.find("name");
  74                                 //user has given a template file
  75                                 if(it != parameters.end()){
  76                                         path = hasPath(it->second);
  77                                         //if the user has not given a path then, add inputdir. else leave path alone.
  78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
  79                                 }
  80
  81                                 it = parameters.find("taxonomy");
  82                                 //user has given a template file
  83                                 if(it != parameters.end()){
  84                                         path = hasPath(it->second);
  85                                         //if the user has not given a path then, add inputdir. else leave path alone.
  86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
  87                                 }
  88
  89                                 it = parameters.find("fasta");
  90                                 //user has given a template file
  91                                 if(it != parameters.end()){
  92                                         path = hasPath(it->second);
  93                                         //if the user has not given a path then, add inputdir. else leave path alone.
  94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
  95                                 }
  96                         }
  97
  98                         //check for required parameters
  99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 100                         if (phylipfile == "not open") { abort = true; }
 101                         else if (phylipfile == "not found") { phylipfile = ""; }
 102                         else {  distfile = phylipfile;  format = "phylip";      }
 103
 104                         columnfile = validParameter.validFile(parameters, "column", true);
 105                         if (columnfile == "not open") { abort = true; }
 106                         else if (columnfile == "not found") { columnfile = ""; }
 107                         else {  distfile = columnfile; format = "column";       }
 108
 109                         namefile = validParameter.validFile(parameters, "name", true);
 110                         if (namefile == "not open") { abort = true; }
 111                         else if (namefile == "not found") { namefile = ""; }
 112
 113                         fastafile = validParameter.validFile(parameters, "fasta", true);
 114                         if (fastafile == "not open") { abort = true; }
 115                         else if (fastafile == "not found") { fastafile = ""; }
 116                         else { distfile = fastafile;  splitmethod = "fasta";  }
 117
 118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 119                         if (taxFile == "not open") { abort = true; }
 120                         else if (taxFile == "not found") { taxFile = ""; }
 121
 122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
 123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 124
 125                         if (columnfile != "") {
 126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
 127                         }
 128
 129                         if (fastafile != "") {
 130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 131                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 132                         }
 133
 134                         //check for optional parameter and set defaults
 135                         // ...at some point should added some additional type checking...
 136                         //get user cutoff and precision or use defaults
 137                         string temp;
 138                         temp = validParameter.validFile(parameters, "precision", false);
 139                         if (temp == "not found") { temp = "100"; }
 140                         //saves precision legnth for formatting below
 141                         length = temp.length();
 142                         convert(temp, precision);
 143
 144                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
 145                         hard = isTrue(temp);
 146
 147                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 148                         large = isTrue(temp);
 149
 150                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 151                         convert(temp, processors);
 152
 153                         temp = validParameter.validFile(parameters, "splitmethod", false);
 154                         if (splitmethod != "fasta") {
 155                                 if (temp == "not found")  { splitmethod = "distance"; }
 156                                 else {  splitmethod = temp; }
 157                         }
 158
 159                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
 160                         convert(temp, cutoff);
 161                         cutoff += (5 / (precision * 10.0));
 162
 163                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
 164                         convert(temp, taxLevelCutoff);
 165
 166                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
 167
 168                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 169                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 170
 171                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 172                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 173
 174                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 175
 176                         showabund = validParameter.validFile(parameters, "showabund", false);
 177                         if (showabund == "not found") { showabund = "T"; }
 178
 179                         timing = validParameter.validFile(parameters, "timing", false);
 180                         if (timing == "not found") { timing = "F"; }
 181
 182                 }
 183         }
 184         catch(exception& e) {
 185                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 186                 exit(1);
 187         }
 188 }
 189
 190 //**********************************************************************************************************************
 191
 192 void ClusterSplitCommand::help(){
 193         try {
 194                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
 195                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
 196                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
 197                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
 198                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
 199                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
 200                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
 201                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 202                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
 203                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 204                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
 205                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
 206                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
 207                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
 208                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 209                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
 210                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 211                 #ifdef USE_MPI
 212                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
 213                 #endif
 214                 m->mothurOut("The cluster.split command should be in the following format: \n");
 215                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 216                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
 217
 218         }
 219         catch(exception& e) {
 220                 m->errorOut(e, "ClusterSplitCommand", "help");
 221                 exit(1);
 222         }
 223 }
 224
 225 //**********************************************************************************************************************
 226
 227 ClusterSplitCommand::~ClusterSplitCommand(){}
 228
 229 //**********************************************************************************************************************
 230
 231 int ClusterSplitCommand::execute(){
 232         try {
 233
 234                 if (abort == true) {    return 0;       }
 235
 236                 time_t estart;
 237                 vector<string> listFileNames;
 238                 set<string> labels;
 239                 string singletonName = "";
 240                 double saveCutoff = cutoff;
 241
 242                 //****************** file prep work ******************************//
 243                 #ifdef USE_MPI
 244                         int pid;
 245                         int tag = 2001;
 246                         MPI_Status status;
 247                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 248                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 249
 250                         if (pid == 0) { //only process 0 converts and splits
 251
 252                 #endif
 253
 254                 //if user gave a phylip file convert to column file
 255                 if (format == "phylip") {
 256                         estart = time(NULL);
 257                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 258
 259                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 260
 261                         NameAssignment* nameMap = NULL;
 262                         convert->setFormat("phylip");
 263                         convert->read(nameMap);
 264
 265                         if (m->control_pressed) {  delete convert;  return 0;  }
 266
 267                         distfile = convert->getOutputFile();
 268
 269                         //if no names file given with phylip file, create it
 270                         ListVector* listToMakeNameFile =  convert->getListVector();
 271                         if (namefile == "") {  //you need to make a namefile for split matrix
 272                                 ofstream out;
 273                                 namefile = phylipfile + ".names";
 274                                 openOutputFile(namefile, out);
 275                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 276                                         string bin = listToMakeNameFile->get(i);
 277                                         out << bin << '\t' << bin << endl;
 278                                 }
 279                                 out.close();
 280                         }
 281                         delete listToMakeNameFile;
 282                         delete convert;
 283
 284                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 285                 }
 286                 if (m->control_pressed) { return 0; }
 287
 288                 estart = time(NULL);
 289                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 290
 291                 //split matrix into non-overlapping groups
 292                 SplitMatrix* split;
 293                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
 294                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
 295                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir);      }
 296                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 297
 298                 split->split();
 299
 300                 if (m->control_pressed) { delete split; return 0; }
 301
 302                 singletonName = split->getSingletonNames();
 303                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 304                 delete split;
 305
 306                 if (m->control_pressed) { return 0; }
 307
 308                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 309                 estart = time(NULL);
 310
 311                 //****************** break up files between processes and cluster each file set ******************************//
 312         #ifdef USE_MPI
 313                         ////you are process 0 from above////
 314
 315                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 316                         dividedNames.resize(processors);
 317
 318                         //for each file group figure out which process will complete it
 319                         //want to divide the load intelligently so the big files are spread between processes
 320                         int count = 1;
 321                         for (int i = 0; i < distName.size(); i++) {
 322                                 int processToAssign = (i+1) % processors;
 323                                 if (processToAssign == 0) { processToAssign = processors; }
 324
 325                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 326                         }
 327
 328                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 329                         for (int i = 0; i < processors; i++) {
 330                                 int remainder = ((i+1) % processors);
 331                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 332                         }
 333
 334
 335                         //send each child the list of files it needs to process
 336                         for(int i = 1; i < processors; i++) {
 337                                 //send number of file pairs
 338                                 int num = dividedNames[i].size();
 339                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 340
 341                                 for (int j = 0; j < num; j++) { //send filenames to process i
 342                                         char tempDistFileName[1024];
 343                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 344                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 345
 346                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 347                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 348
 349                                         char tempNameFileName[1024];
 350                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 351                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 352
 353                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 354                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 355                                 }
 356                         }
 357
 358                         //process your share
 359                         listFileNames = cluster(dividedNames[0], labels);
 360
 361                         //receive the other processes info
 362                         for(int i = 1; i < processors; i++) {
 363                                 int num = dividedNames[i].size();
 364
 365                                 double tempCutoff;
 366                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 367                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 368
 369                                 //send list filenames to root process
 370                                 for (int j = 0; j < num; j++) {
 371                                         int lengthList = 0;
 372                                         char tempListFileName[1024];
 373
 374                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 375                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 376
 377                                         string myListFileName = tempListFileName;
 378                                         myListFileName = myListFileName.substr(0, lengthList);
 379
 380                                         listFileNames.push_back(myListFileName);
 381                                 }
 382
 383                                 //send Labels to root process
 384                                 int numLabels = 0;
 385                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 386
 387                                 for (int j = 0; j < numLabels; j++) {
 388                                         int lengthLabel = 0;
 389                                         char tempLabel[100];
 390
 391                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 392                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 393
 394                                         string myLabel = tempLabel;
 395                                         myLabel = myLabel.substr(0, lengthLabel);
 396
 397                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 398                                 }
 399                         }
 400
 401                 }else { //you are a child process
 402                         vector < map<string, string> >  myNames;
 403
 404                         //recieve the files you need to process
 405                         //receive number of file pairs
 406                         int num = 0;
 407                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 408
 409                         myNames.resize(num);
 410
 411                         for (int j = 0; j < num; j++) { //receive filenames to process
 412                                 int lengthDist = 0;
 413                                 char tempDistFileName[1024];
 414
 415                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 416                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 417
 418                                 string myDistFileName = tempDistFileName;
 419                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 420
 421                                 int lengthName = 0;
 422                                 char tempNameFileName[1024];
 423
 424                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 425                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 426
 427                                 string myNameFileName = tempNameFileName;
 428                                 myNameFileName = myNameFileName.substr(0, lengthName);
 429
 430                                 //save file name
 431                                 myNames[j][myDistFileName] = myNameFileName;
 432                         }
 433
 434                         //process them
 435                         listFileNames = cluster(myNames, labels);
 436
 437                         //send cutoff
 438                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 439
 440                         //send list filenames to root process
 441                         for (int j = 0; j < num; j++) {
 442                                 char tempListFileName[1024];
 443                                 strcpy(tempListFileName, listFileNames[j].c_str());
 444                                 int lengthList = listFileNames[j].length();
 445
 446                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 447                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 448                         }
 449
 450                         //send Labels to root process
 451                         int numLabels = labels.size();
 452                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 453
 454                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 455                                 char tempLabel[100];
 456                                 strcpy(tempLabel, (*it).c_str());
 457                                 int lengthLabel = (*it).length();
 458
 459                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 460                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 461                         }
 462                 }
 463
 464                 //make everyone wait
 465                 MPI_Barrier(MPI_COMM_WORLD);
 466
 467         #else
 468
 469                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 470                                 if(processors == 1){
 471                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 472                                 }else{
 473                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 474                                         dividedNames.resize(processors);
 475
 476                                         //for each file group figure out which process will complete it
 477                                         //want to divide the load intelligently so the big files are spread between processes
 478                                         int count = 1;
 479                                         for (int i = 0; i < distName.size(); i++) {
 480                                                 int processToAssign = (i+1) % processors;
 481                                                 if (processToAssign == 0) { processToAssign = processors; }
 482
 483                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 484                                         }
 485
 486                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 487                                         for (int i = 0; i < processors; i++) {
 488                                                 int remainder = ((i+1) % processors);
 489                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 490                                         }
 491
 492                                         createProcesses(dividedNames);
 493
 494                                         if (m->control_pressed) { return 0; }
 495
 496                                         //get list of list file names from each process
 497                                         for(int i=0;i<processors;i++){
 498                                                 string filename = toString(processIDS[i]) + ".temp";
 499                                                 ifstream in;
 500                                                 openInputFile(filename, in);
 501
 502                                                 in >> tag; gobble(in);
 503
 504                                                 while(!in.eof()) {
 505                                                         string tempName;
 506                                                         in >> tempName; gobble(in);
 507                                                         listFileNames.push_back(tempName);
 508                                                 }
 509                                                 in.close();
 510                                                 remove((toString(processIDS[i]) + ".temp").c_str());
 511
 512                                                 //get labels
 513                                                 filename = toString(processIDS[i]) + ".temp.labels";
 514                                                 ifstream in2;
 515                                                 openInputFile(filename, in2);
 516
 517                                                 float tempCutoff;
 518                                                 in2 >> tempCutoff; gobble(in2);
 519                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 520
 521                                                 while(!in2.eof()) {
 522                                                         string tempName;
 523                                                         in2 >> tempName; gobble(in2);
 524                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
 525                                                 }
 526                                                 in2.close();
 527                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
 528                                         }
 529                                 }
 530                 #else
 531                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 532                 #endif
 533         #endif
 534                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 535
 536                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 537
 538                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 539
 540                 //****************** merge list file and create rabund and sabund files ******************************//
 541                 estart = time(NULL);
 542                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 543
 544                 #ifdef USE_MPI
 545                         if (pid == 0) { //only process 0 merges
 546                 #endif
 547
 548                 ListVector* listSingle;
 549                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 550
 551                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 552
 553                 mergeLists(listFileNames, labelBins, listSingle);
 554
 555                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 556
 557                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 558
 559                 m->mothurOutEndLine();
 560                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 561                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 562                 m->mothurOutEndLine();
 563
 564                 #ifdef USE_MPI
 565                         } //only process 0 merges
 566
 567                         //make everyone wait
 568                         MPI_Barrier(MPI_COMM_WORLD);
 569                 #endif
 570
 571                 return 0;
 572         }
 573         catch(exception& e) {
 574                 m->errorOut(e, "ClusterSplitCommand", "execute");
 575                 exit(1);
 576         }
 577 }
 578 //**********************************************************************************************************************
 579 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 580         try {
 581
 582                 map<float, int> labelBin;
 583                 vector<float> orderFloat;
 584                 int numSingleBins;
 585
 586                 //read in singletons
 587                 if (singleton != "none") {
 588                         ifstream in;
 589                         openInputFile(singleton, in);
 590
 591                         string firstCol, secondCol;
 592                         listSingle = new ListVector();
 593                         while (!in.eof()) {
 594                                 in >> firstCol >> secondCol; gobble(in);
 595                                 listSingle->push_back(secondCol);
 596                         }
 597                         in.close();
 598                         remove(singleton.c_str());
 599
 600                         numSingleBins = listSingle->getNumBins();
 601                 }else{  listSingle = NULL; numSingleBins = 0;  }
 602
 603                 //go through users set and make them floats so we can sort them
 604                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 605                         float temp = -10.0;
 606
 607                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 608                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 609
 610                         if (temp <= cutoff) {
 611                                 orderFloat.push_back(temp);
 612                                 labelBin[temp] = numSingleBins; //initialize numbins
 613                         }
 614                 }
 615
 616                 //sort order
 617                 sort(orderFloat.begin(), orderFloat.end());
 618                 userLabels.clear();
 619
 620                 //get the list info from each file
 621                 for (int k = 0; k < listNames.size(); k++) {
 622
 623                         if (m->control_pressed) {
 624                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
 625                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
 626                                 return labelBin;
 627                         }
 628
 629                         InputData* input = new InputData(listNames[k], "list");
 630                         ListVector* list = input->getListVector();
 631                         string lastLabel = list->getLabel();
 632
 633                         string filledInList = listNames[k] + "filledInTemp";
 634                         ofstream outFilled;
 635                         openOutputFile(filledInList, outFilled);
 636
 637                         //for each label needed
 638                         for(int l = 0; l < orderFloat.size(); l++){
 639
 640                                 string thisLabel;
 641                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 642                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 643
 644                                 //this file has reached the end
 645                                 if (list == NULL) {
 646                                         list = input->getListVector(lastLabel, true);
 647                                 }else{  //do you have the distance, or do you need to fill in
 648
 649                                         float labelFloat;
 650                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 651                                         else { convert(list->getLabel(), labelFloat); }
 652
 653                                         //check for missing labels
 654                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 655                                                 //if its bigger get last label, otherwise keep it
 656                                                 delete list;
 657                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 658                                         }
 659                                         lastLabel = list->getLabel();
 660                                 }
 661
 662                                 //print to new file
 663                                 list->setLabel(thisLabel);
 664                                 list->print(outFilled);
 665
 666                                 //update labelBin
 667                                 labelBin[orderFloat[l]] += list->getNumBins();
 668
 669                                 delete list;
 670
 671                                 list = input->getListVector();
 672                         }
 673
 674                         if (list != NULL) { delete list; }
 675                         delete input;
 676
 677                         outFilled.close();
 678                         remove(listNames[k].c_str());
 679                         rename(filledInList.c_str(), listNames[k].c_str());
 680                 }
 681
 682                 return labelBin;
 683         }
 684         catch(exception& e) {
 685                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 686                 exit(1);
 687         }
 688 }
 689 //**********************************************************************************************************************
 690 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 691         try {
 692                 if (outputDir == "") { outputDir += hasPath(distfile); }
 693                 fileroot = outputDir + getRootName(getSimpleName(distfile));
 694
 695                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
 696                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
 697                 openOutputFile(fileroot+ tag + ".list",         outList);
 698
 699                 outputNames.push_back(fileroot+ tag + ".sabund");
 700                 outputNames.push_back(fileroot+ tag + ".rabund");
 701                 outputNames.push_back(fileroot+ tag + ".list");
 702
 703                 map<float, int>::iterator itLabel;
 704
 705                 //for each label needed
 706                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 707
 708                         string thisLabel;
 709                         if (itLabel->first == -1) { thisLabel = "unique"; }
 710                         else { thisLabel = toString(itLabel->first,  length-1);  }
 711
 712                         outList << thisLabel << '\t' << itLabel->second << '\t';
 713
 714                         RAbundVector* rabund = new RAbundVector();
 715                         rabund->setLabel(thisLabel);
 716
 717                         //add in singletons
 718                         if (listSingle != NULL) {
 719                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 720                                         outList << listSingle->get(j) << '\t';
 721                                         rabund->push_back(getNumNames(listSingle->get(j)));
 722                                 }
 723                         }
 724
 725                         //get the list info from each file
 726                         for (int k = 0; k < listNames.size(); k++) {
 727
 728                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
 729
 730                                 InputData* input = new InputData(listNames[k], "list");
 731                                 ListVector* list = input->getListVector(thisLabel);
 732
 733                                 //this file has reached the end
 734                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 735                                 else {
 736                                         for (int j = 0; j < list->getNumBins(); j++) {
 737                                                 outList << list->get(j) << '\t';
 738                                                 rabund->push_back(getNumNames(list->get(j)));
 739                                         }
 740                                         delete list;
 741                                 }
 742                                 delete input;
 743                         }
 744
 745                         SAbundVector sabund = rabund->getSAbundVector();
 746
 747                         sabund.print(outSabund);
 748                         rabund->print(outRabund);
 749                         outList << endl;
 750
 751                         delete rabund;
 752                 }
 753
 754                 outList.close();
 755                 outRabund.close();
 756                 outSabund.close();
 757
 758                 if (listSingle != NULL) { delete listSingle;  }
 759
 760                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
 761
 762                 return 0;
 763         }
 764         catch(exception& e) {
 765                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 766                 exit(1);
 767         }
 768 }
 769
 770 //**********************************************************************************************************************
 771
 772 void ClusterSplitCommand::printData(ListVector* oldList){
 773         try {
 774                 string label = oldList->getLabel();
 775                 RAbundVector oldRAbund = oldList->getRAbundVector();
 776
 777                 oldRAbund.setLabel(label);
 778                 if (isTrue(showabund)) {
 779                         oldRAbund.getSAbundVector().print(cout);
 780                 }
 781                 oldRAbund.print(outRabund);
 782                 oldRAbund.getSAbundVector().print(outSabund);
 783
 784                 oldList->print(outList);
 785         }
 786         catch(exception& e) {
 787                 m->errorOut(e, "ClusterSplitCommand", "printData");
 788                 exit(1);
 789         }
 790 }
 791 //**********************************************************************************************************************
 792 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
 793         try {
 794
 795         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 796                 int process = 0;
 797                 int exitCommand = 1;
 798                 processIDS.clear();
 799
 800                 //loop through and create all the processes you want
 801                 while (process != processors) {
 802                         int pid = fork();
 803
 804                         if (pid > 0) {
 805                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 806                                 process++;
 807                         }else if (pid == 0){
 808                                 set<string> labels;
 809                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 810
 811                                 //write out names to file
 812                                 string filename = toString(getpid()) + ".temp";
 813                                 ofstream out;
 814                                 openOutputFile(filename, out);
 815                                 out << tag << endl;
 816                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 817                                 out.close();
 818
 819                                 //print out labels
 820                                 ofstream outLabels;
 821                                 filename = toString(getpid()) + ".temp.labels";
 822                                 openOutputFile(filename, outLabels);
 823
 824                                 outLabels << cutoff << endl;
 825                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 826                                         outLabels << (*it) << endl;
 827                                 }
 828                                 outLabels.close();
 829
 830                                 exit(0);
 831                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
 832                 }
 833
 834                 //force parent to wait until all the processes are done
 835                 for (int i=0;i<processors;i++) {
 836                         int temp = processIDS[i];
 837                         wait(&temp);
 838                 }
 839
 840                 return exitCommand;
 841         #endif
 842
 843         }
 844         catch(exception& e) {
 845                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
 846                 exit(1);
 847         }
 848 }
 849 //**********************************************************************************************************************
 850
 851 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 852         try {
 853                 Cluster* cluster;
 854                 SparseMatrix* matrix;
 855                 ListVector* list;
 856                 ListVector oldList;
 857                 RAbundVector* rabund;
 858
 859                 vector<string> listFileNames;
 860
 861                 double smallestCutoff = cutoff;
 862
 863                 //cluster each distance file
 864                 for (int i = 0; i < distNames.size(); i++) {
 865                         if (m->control_pressed) { return listFileNames; }
 866
 867                         string thisNamefile = distNames[i].begin()->second;
 868                         string thisDistFile = distNames[i].begin()->first;
 869
 870                         //read in distance file
 871                         globaldata->setNameFile(thisNamefile);
 872                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 873
 874                         #ifdef USE_MPI
 875                                 int pid;
 876                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 877
 878                                 //output your files too
 879                                 if (pid != 0) {
 880                                         cout << endl << "Reading " << thisDistFile << endl;
 881                                 }
 882                         #endif
 883
 884                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
 885
 886                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
 887                         read->setCutoff(cutoff);
 888
 889                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
 890                         nameMap->readMap();
 891                         read->read(nameMap);
 892
 893                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
 894
 895                         list = read->getListVector();
 896                         oldList = *list;
 897                         matrix = read->getMatrix();
 898
 899                         delete read;
 900                         delete nameMap;
 901
 902
 903                         #ifdef USE_MPI
 904                                 //output your files too
 905                                 if (pid != 0) {
 906                                         cout << endl << "Clustering " << thisDistFile << endl;
 907                                 }
 908                         #endif
 909
 910                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 911
 912                         rabund = new RAbundVector(list->getRAbundVector());
 913
 914                         //create cluster
 915                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
 916                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
 917                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
 918                         tag = cluster->getTag();
 919
 920                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
 921                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
 922
 923                         ofstream listFile;
 924                         openOutputFile(fileroot+ tag + ".list", listFile);
 925
 926                         listFileNames.push_back(fileroot+ tag + ".list");
 927
 928                         time_t estart = time(NULL);
 929
 930                         float previousDist = 0.00000;
 931                         float rndPreviousDist = 0.00000;
 932
 933                         oldList = *list;
 934
 935                         print_start = true;
 936                         start = time(NULL);
 937                         double saveCutoff = cutoff;
 938
 939                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
 940
 941                                 if (m->control_pressed) { //clean up
 942                                         delete matrix; delete list;     delete cluster; delete rabund;
 943                                         listFile.close();
 944                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 945                                         listFileNames.clear(); return listFileNames;
 946                                 }
 947
 948                                 cluster->update(saveCutoff);
 949
 950                                 float dist = matrix->getSmallDist();
 951                                 float rndDist;
 952                                 if (hard) {
 953                                         rndDist = ceilDist(dist, precision);
 954                                 }else{
 955                                         rndDist = roundDist(dist, precision);
 956                                 }
 957
 958                                 if(previousDist <= 0.0000 && dist != previousDist){
 959                                         oldList.setLabel("unique");
 960                                         oldList.print(listFile);
 961                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
 962                                 }
 963                                 else if(rndDist != rndPreviousDist){
 964                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
 965                                         oldList.print(listFile);
 966                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 967                                 }
 968
 969                                 previousDist = dist;
 970                                 rndPreviousDist = rndDist;
 971                                 oldList = *list;
 972                         }
 973
 974
 975                         if(previousDist <= 0.0000){
 976                                 oldList.setLabel("unique");
 977                                 oldList.print(listFile);
 978                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
 979                         }
 980                         else if(rndPreviousDist<cutoff){
 981                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
 982                                 oldList.print(listFile);
 983                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 984                         }
 985
 986                         delete matrix; delete list;     delete cluster; delete rabund;
 987                         listFile.close();
 988
 989                         if (m->control_pressed) { //clean up
 990                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 991                                 listFileNames.clear(); return listFileNames;
 992                         }
 993
 994                         remove(thisDistFile.c_str());
 995                         remove(thisNamefile.c_str());
 996
 997                         if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  }
 998
 999                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1000                 }
1001
1002                 cutoff = smallestCutoff;
1003
1004                 return listFileNames;
1005
1006         }
1007         catch(exception& e) {
1008                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1009                 exit(1);
1010         }
1011
1012
1013 }
1014
1015 //**********************************************************************************************************************