clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11 #include "readcluster.h"
  12 #include "splitmatrix.h"
  13 #include "readphylip.h"
  14 #include "readcolumn.h"
  15 #include "readmatrix.hpp"
  16 #include "inputdata.h"
  17
  18
  19 //**********************************************************************************************************************
  20 vector<string> ClusterSplitCommand::getValidParameters(){
  21         try {
  22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
  24                 return myArray;
  25         }
  26         catch(exception& e) {
  27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
  28                 exit(1);
  29         }
  30 }
  31 //**********************************************************************************************************************
  32 ClusterSplitCommand::ClusterSplitCommand(){
  33         try {
  34                 abort = true;
  35                 //initialize outputTypes
  36                 vector<string> tempOutNames;
  37                 outputTypes["list"] = tempOutNames;
  38                 outputTypes["rabund"] = tempOutNames;
  39                 outputTypes["sabund"] = tempOutNames;
  40         }
  41         catch(exception& e) {
  42                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
  43                 exit(1);
  44         }
  45 }
  46 //**********************************************************************************************************************
  47 vector<string> ClusterSplitCommand::getRequiredParameters(){
  48         try {
  49                 string Array[] =  {"fasta","phylip","column","or"};
  50                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  51                 return myArray;
  52         }
  53         catch(exception& e) {
  54                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
  55                 exit(1);
  56         }
  57 }
  58 //**********************************************************************************************************************
  59 vector<string> ClusterSplitCommand::getRequiredFiles(){
  60         try {
  61                 vector<string> myArray;
  62                 return myArray;
  63         }
  64         catch(exception& e) {
  65                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
  66                 exit(1);
  67         }
  68 }
  69 //**********************************************************************************************************************
  70 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  71 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  72         try{
  73                 globaldata = GlobalData::getInstance();
  74                 abort = false;
  75                 format = "";
  76
  77                 //allow user to run help
  78                 if(option == "help") { help(); abort = true; }
  79
  80                 else {
  81                         //valid paramters for this command
  82                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  83                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  84
  85                         OptionParser parser(option);
  86                         map<string,string> parameters = parser.getParameters();
  87
  88                         ValidParameters validParameter("cluster.split");
  89
  90                         //check to make sure all parameters are valid for command
  91                         map<string,string>::iterator it;
  92                         for (it = parameters.begin(); it != parameters.end(); it++) {
  93                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
  94                                         abort = true;
  95                                 }
  96                         }
  97
  98                         //initialize outputTypes
  99                         vector<string> tempOutNames;
 100                         outputTypes["list"] = tempOutNames;
 101                         outputTypes["rabund"] = tempOutNames;
 102                         outputTypes["sabund"] = tempOutNames;
 103
 104                         globaldata->newRead();
 105
 106                         //if the user changes the output directory command factory will send this info to us in the output parameter
 107                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 108
 109                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 111                         if (inputDir == "not found"){   inputDir = "";          }
 112                         else {
 113                                 string path;
 114                                 it = parameters.find("phylip");
 115                                 //user has given a template file
 116                                 if(it != parameters.end()){
 117                                         path = m->hasPath(it->second);
 118                                         //if the user has not given a path then, add inputdir. else leave path alone.
 119                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 120                                 }
 121
 122                                 it = parameters.find("column");
 123                                 //user has given a template file
 124                                 if(it != parameters.end()){
 125                                         path = m->hasPath(it->second);
 126                                         //if the user has not given a path then, add inputdir. else leave path alone.
 127                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 128                                 }
 129
 130                                 it = parameters.find("name");
 131                                 //user has given a template file
 132                                 if(it != parameters.end()){
 133                                         path = m->hasPath(it->second);
 134                                         //if the user has not given a path then, add inputdir. else leave path alone.
 135                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 136                                 }
 137
 138                                 it = parameters.find("taxonomy");
 139                                 //user has given a template file
 140                                 if(it != parameters.end()){
 141                                         path = m->hasPath(it->second);
 142                                         //if the user has not given a path then, add inputdir. else leave path alone.
 143                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 144                                 }
 145
 146                                 it = parameters.find("fasta");
 147                                 //user has given a template file
 148                                 if(it != parameters.end()){
 149                                         path = m->hasPath(it->second);
 150                                         //if the user has not given a path then, add inputdir. else leave path alone.
 151                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 152                                 }
 153                         }
 154
 155                         //check for required parameters
 156                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 157                         if (phylipfile == "not open") { abort = true; }
 158                         else if (phylipfile == "not found") { phylipfile = ""; }
 159                         else {  distfile = phylipfile;  format = "phylip";      }
 160
 161                         columnfile = validParameter.validFile(parameters, "column", true);
 162                         if (columnfile == "not open") { abort = true; }
 163                         else if (columnfile == "not found") { columnfile = ""; }
 164                         else {  distfile = columnfile; format = "column";       }
 165
 166                         namefile = validParameter.validFile(parameters, "name", true);
 167                         if (namefile == "not open") { abort = true; }
 168                         else if (namefile == "not found") { namefile = ""; }
 169
 170                         fastafile = validParameter.validFile(parameters, "fasta", true);
 171                         if (fastafile == "not open") { abort = true; }
 172                         else if (fastafile == "not found") { fastafile = ""; }
 173                         else { distfile = fastafile;  splitmethod = "fasta";  }
 174
 175                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 176                         if (taxFile == "not open") { abort = true; }
 177                         else if (taxFile == "not found") { taxFile = ""; }
 178
 179                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
 180                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 181
 182                         if (columnfile != "") {
 183                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
 184                         }
 185
 186                         if (fastafile != "") {
 187                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 188                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 189                         }
 190
 191                         //check for optional parameter and set defaults
 192                         // ...at some point should added some additional type checking...
 193                         //get user cutoff and precision or use defaults
 194                         string temp;
 195                         temp = validParameter.validFile(parameters, "precision", false);
 196                         if (temp == "not found") { temp = "100"; }
 197                         //saves precision legnth for formatting below
 198                         length = temp.length();
 199                         convert(temp, precision);
 200
 201                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
 202                         hard = m->isTrue(temp);
 203
 204                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 205                         large = m->isTrue(temp);
 206
 207                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 208                         convert(temp, processors);
 209
 210                         temp = validParameter.validFile(parameters, "splitmethod", false);
 211                         if (splitmethod != "fasta") {
 212                                 if (temp == "not found")  { splitmethod = "distance"; }
 213                                 else {  splitmethod = temp; }
 214                         }
 215
 216                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
 217                         convert(temp, cutoff);
 218                         cutoff += (5 / (precision * 10.0));
 219
 220                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
 221                         convert(temp, taxLevelCutoff);
 222
 223                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
 224
 225                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 226                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 227
 228                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 229                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 230
 231                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 232
 233                         showabund = validParameter.validFile(parameters, "showabund", false);
 234                         if (showabund == "not found") { showabund = "T"; }
 235
 236                         timing = validParameter.validFile(parameters, "timing", false);
 237                         if (timing == "not found") { timing = "F"; }
 238
 239                 }
 240         }
 241         catch(exception& e) {
 242                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 243                 exit(1);
 244         }
 245 }
 246
 247 //**********************************************************************************************************************
 248
 249 void ClusterSplitCommand::help(){
 250         try {
 251                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
 252                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
 253                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
 254                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
 255                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
 256                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
 257                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
 258                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 259                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
 260                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 261                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
 262                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
 263                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
 264                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
 265                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 266                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
 267                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 268                 #ifdef USE_MPI
 269                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
 270                 #endif
 271                 m->mothurOut("The cluster.split command should be in the following format: \n");
 272                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 273                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
 274
 275         }
 276         catch(exception& e) {
 277                 m->errorOut(e, "ClusterSplitCommand", "help");
 278                 exit(1);
 279         }
 280 }
 281
 282 //**********************************************************************************************************************
 283
 284 ClusterSplitCommand::~ClusterSplitCommand(){}
 285
 286 //**********************************************************************************************************************
 287
 288 int ClusterSplitCommand::execute(){
 289         try {
 290
 291                 if (abort == true) {    return 0;       }
 292
 293                 time_t estart;
 294                 vector<string> listFileNames;
 295                 set<string> labels;
 296                 string singletonName = "";
 297                 double saveCutoff = cutoff;
 298
 299                 //****************** file prep work ******************************//
 300                 #ifdef USE_MPI
 301                         int pid;
 302                         int tag = 2001;
 303                         MPI_Status status;
 304                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 305                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 306
 307                         if (pid == 0) { //only process 0 converts and splits
 308
 309                 #endif
 310
 311                 //if user gave a phylip file convert to column file
 312                 if (format == "phylip") {
 313                         estart = time(NULL);
 314                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 315
 316                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 317
 318                         NameAssignment* nameMap = NULL;
 319                         convert->setFormat("phylip");
 320                         convert->read(nameMap);
 321
 322                         if (m->control_pressed) {  delete convert;  return 0;  }
 323
 324                         distfile = convert->getOutputFile();
 325
 326                         //if no names file given with phylip file, create it
 327                         ListVector* listToMakeNameFile =  convert->getListVector();
 328                         if (namefile == "") {  //you need to make a namefile for split matrix
 329                                 ofstream out;
 330                                 namefile = phylipfile + ".names";
 331                                 m->openOutputFile(namefile, out);
 332                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 333                                         string bin = listToMakeNameFile->get(i);
 334                                         out << bin << '\t' << bin << endl;
 335                                 }
 336                                 out.close();
 337                         }
 338                         delete listToMakeNameFile;
 339                         delete convert;
 340
 341                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 342                 }
 343                 if (m->control_pressed) { return 0; }
 344
 345                 estart = time(NULL);
 346                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 347
 348                 //split matrix into non-overlapping groups
 349                 SplitMatrix* split;
 350                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
 351                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
 352                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
 353                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 354
 355                 split->split();
 356
 357                 if (m->control_pressed) { delete split; return 0; }
 358
 359                 singletonName = split->getSingletonNames();
 360                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 361                 delete split;
 362
 363                 if (m->control_pressed) { return 0; }
 364
 365                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 366                 estart = time(NULL);
 367
 368                 //****************** break up files between processes and cluster each file set ******************************//
 369         #ifdef USE_MPI
 370                         ////you are process 0 from above////
 371
 372                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 373                         dividedNames.resize(processors);
 374
 375                         //for each file group figure out which process will complete it
 376                         //want to divide the load intelligently so the big files are spread between processes
 377                         int count = 1;
 378                         for (int i = 0; i < distName.size(); i++) {
 379                                 int processToAssign = (i+1) % processors;
 380                                 if (processToAssign == 0) { processToAssign = processors; }
 381
 382                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 383                         }
 384
 385                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 386                         for (int i = 0; i < processors; i++) {
 387                                 int remainder = ((i+1) % processors);
 388                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 389                         }
 390
 391
 392                         //send each child the list of files it needs to process
 393                         for(int i = 1; i < processors; i++) {
 394                                 //send number of file pairs
 395                                 int num = dividedNames[i].size();
 396                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 397
 398                                 for (int j = 0; j < num; j++) { //send filenames to process i
 399                                         char tempDistFileName[1024];
 400                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 401                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 402
 403                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 404                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 405
 406                                         char tempNameFileName[1024];
 407                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 408                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 409
 410                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 411                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 412                                 }
 413                         }
 414
 415                         //process your share
 416                         listFileNames = cluster(dividedNames[0], labels);
 417
 418                         //receive the other processes info
 419                         for(int i = 1; i < processors; i++) {
 420                                 int num = dividedNames[i].size();
 421
 422                                 double tempCutoff;
 423                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 424                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 425
 426                                 //send list filenames to root process
 427                                 for (int j = 0; j < num; j++) {
 428                                         int lengthList = 0;
 429                                         char tempListFileName[1024];
 430
 431                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 432                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 433
 434                                         string myListFileName = tempListFileName;
 435                                         myListFileName = myListFileName.substr(0, lengthList);
 436
 437                                         listFileNames.push_back(myListFileName);
 438                                 }
 439
 440                                 //send Labels to root process
 441                                 int numLabels = 0;
 442                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 443
 444                                 for (int j = 0; j < numLabels; j++) {
 445                                         int lengthLabel = 0;
 446                                         char tempLabel[100];
 447
 448                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 449                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 450
 451                                         string myLabel = tempLabel;
 452                                         myLabel = myLabel.substr(0, lengthLabel);
 453
 454                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 455                                 }
 456                         }
 457
 458                 }else { //you are a child process
 459                         vector < map<string, string> >  myNames;
 460
 461                         //recieve the files you need to process
 462                         //receive number of file pairs
 463                         int num = 0;
 464                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 465
 466                         myNames.resize(num);
 467
 468                         for (int j = 0; j < num; j++) { //receive filenames to process
 469                                 int lengthDist = 0;
 470                                 char tempDistFileName[1024];
 471
 472                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 473                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 474
 475                                 string myDistFileName = tempDistFileName;
 476                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 477
 478                                 int lengthName = 0;
 479                                 char tempNameFileName[1024];
 480
 481                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 482                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 483
 484                                 string myNameFileName = tempNameFileName;
 485                                 myNameFileName = myNameFileName.substr(0, lengthName);
 486
 487                                 //save file name
 488                                 myNames[j][myDistFileName] = myNameFileName;
 489                         }
 490
 491                         //process them
 492                         listFileNames = cluster(myNames, labels);
 493
 494                         //send cutoff
 495                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 496
 497                         //send list filenames to root process
 498                         for (int j = 0; j < num; j++) {
 499                                 char tempListFileName[1024];
 500                                 strcpy(tempListFileName, listFileNames[j].c_str());
 501                                 int lengthList = listFileNames[j].length();
 502
 503                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 504                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 505                         }
 506
 507                         //send Labels to root process
 508                         int numLabels = labels.size();
 509                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 510
 511                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 512                                 char tempLabel[100];
 513                                 strcpy(tempLabel, (*it).c_str());
 514                                 int lengthLabel = (*it).length();
 515
 516                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 517                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 518                         }
 519                 }
 520
 521                 //make everyone wait
 522                 MPI_Barrier(MPI_COMM_WORLD);
 523
 524         #else
 525
 526                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 527                                 if(processors == 1){
 528                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 529                                 }else{
 530                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 531                                         dividedNames.resize(processors);
 532
 533                                         //for each file group figure out which process will complete it
 534                                         //want to divide the load intelligently so the big files are spread between processes
 535                                         int count = 1;
 536                                         for (int i = 0; i < distName.size(); i++) {
 537                                                 int processToAssign = (i+1) % processors;
 538                                                 if (processToAssign == 0) { processToAssign = processors; }
 539
 540                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 541                                         }
 542
 543                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 544                                         for (int i = 0; i < processors; i++) {
 545                                                 int remainder = ((i+1) % processors);
 546                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 547                                         }
 548
 549                                         createProcesses(dividedNames);
 550
 551                                         if (m->control_pressed) { return 0; }
 552
 553                                         //get list of list file names from each process
 554                                         for(int i=0;i<processors;i++){
 555                                                 string filename = toString(processIDS[i]) + ".temp";
 556                                                 ifstream in;
 557                                                 m->openInputFile(filename, in);
 558
 559                                                 in >> tag; m->gobble(in);
 560
 561                                                 while(!in.eof()) {
 562                                                         string tempName;
 563                                                         in >> tempName; m->gobble(in);
 564                                                         listFileNames.push_back(tempName);
 565                                                 }
 566                                                 in.close();
 567                                                 remove((toString(processIDS[i]) + ".temp").c_str());
 568
 569                                                 //get labels
 570                                                 filename = toString(processIDS[i]) + ".temp.labels";
 571                                                 ifstream in2;
 572                                                 m->openInputFile(filename, in2);
 573
 574                                                 float tempCutoff;
 575                                                 in2 >> tempCutoff; m->gobble(in2);
 576                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 577
 578                                                 while(!in2.eof()) {
 579                                                         string tempName;
 580                                                         in2 >> tempName; m->gobble(in2);
 581                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
 582                                                 }
 583                                                 in2.close();
 584                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
 585                                         }
 586                                 }
 587                 #else
 588                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 589                 #endif
 590         #endif
 591                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 592
 593                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 594
 595                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 596
 597                 //****************** merge list file and create rabund and sabund files ******************************//
 598                 estart = time(NULL);
 599                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 600
 601                 #ifdef USE_MPI
 602                         if (pid == 0) { //only process 0 merges
 603                 #endif
 604
 605                 ListVector* listSingle;
 606                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 607
 608                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 609
 610                 mergeLists(listFileNames, labelBins, listSingle);
 611
 612                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 613
 614                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 615
 616                 m->mothurOutEndLine();
 617                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 618                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 619                 m->mothurOutEndLine();
 620
 621                 #ifdef USE_MPI
 622                         } //only process 0 merges
 623
 624                         //make everyone wait
 625                         MPI_Barrier(MPI_COMM_WORLD);
 626                 #endif
 627
 628                 return 0;
 629         }
 630         catch(exception& e) {
 631                 m->errorOut(e, "ClusterSplitCommand", "execute");
 632                 exit(1);
 633         }
 634 }
 635 //**********************************************************************************************************************
 636 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 637         try {
 638
 639                 map<float, int> labelBin;
 640                 vector<float> orderFloat;
 641                 int numSingleBins;
 642
 643                 //read in singletons
 644                 if (singleton != "none") {
 645                         ifstream in;
 646                         m->openInputFile(singleton, in);
 647
 648                         string firstCol, secondCol;
 649                         listSingle = new ListVector();
 650                         while (!in.eof()) {
 651                                 in >> firstCol >> secondCol; m->gobble(in);
 652                                 listSingle->push_back(secondCol);
 653                         }
 654                         in.close();
 655                         remove(singleton.c_str());
 656
 657                         numSingleBins = listSingle->getNumBins();
 658                 }else{  listSingle = NULL; numSingleBins = 0;  }
 659
 660                 //go through users set and make them floats so we can sort them
 661                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 662                         float temp = -10.0;
 663
 664                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 665                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 666
 667                         if (temp <= cutoff) {
 668                                 orderFloat.push_back(temp);
 669                                 labelBin[temp] = numSingleBins; //initialize numbins
 670                         }
 671                 }
 672
 673                 //sort order
 674                 sort(orderFloat.begin(), orderFloat.end());
 675                 userLabels.clear();
 676
 677                 //get the list info from each file
 678                 for (int k = 0; k < listNames.size(); k++) {
 679
 680                         if (m->control_pressed) {
 681                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
 682                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
 683                                 return labelBin;
 684                         }
 685
 686                         InputData* input = new InputData(listNames[k], "list");
 687                         ListVector* list = input->getListVector();
 688                         string lastLabel = list->getLabel();
 689
 690                         string filledInList = listNames[k] + "filledInTemp";
 691                         ofstream outFilled;
 692                         m->openOutputFile(filledInList, outFilled);
 693
 694                         //for each label needed
 695                         for(int l = 0; l < orderFloat.size(); l++){
 696
 697                                 string thisLabel;
 698                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 699                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 700
 701                                 //this file has reached the end
 702                                 if (list == NULL) {
 703                                         list = input->getListVector(lastLabel, true);
 704                                 }else{  //do you have the distance, or do you need to fill in
 705
 706                                         float labelFloat;
 707                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 708                                         else { convert(list->getLabel(), labelFloat); }
 709
 710                                         //check for missing labels
 711                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 712                                                 //if its bigger get last label, otherwise keep it
 713                                                 delete list;
 714                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 715                                         }
 716                                         lastLabel = list->getLabel();
 717                                 }
 718
 719                                 //print to new file
 720                                 list->setLabel(thisLabel);
 721                                 list->print(outFilled);
 722
 723                                 //update labelBin
 724                                 labelBin[orderFloat[l]] += list->getNumBins();
 725
 726                                 delete list;
 727
 728                                 list = input->getListVector();
 729                         }
 730
 731                         if (list != NULL) { delete list; }
 732                         delete input;
 733
 734                         outFilled.close();
 735                         remove(listNames[k].c_str());
 736                         rename(filledInList.c_str(), listNames[k].c_str());
 737                 }
 738
 739                 return labelBin;
 740         }
 741         catch(exception& e) {
 742                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 743                 exit(1);
 744         }
 745 }
 746 //**********************************************************************************************************************
 747 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 748         try {
 749                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 750                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 751
 752                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
 753                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
 754                 m->openOutputFile(fileroot+ tag + ".list",              outList);
 755
 756                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
 757                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
 758                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
 759
 760                 map<float, int>::iterator itLabel;
 761
 762                 //for each label needed
 763                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 764
 765                         string thisLabel;
 766                         if (itLabel->first == -1) { thisLabel = "unique"; }
 767                         else { thisLabel = toString(itLabel->first,  length-1);  }
 768
 769                         outList << thisLabel << '\t' << itLabel->second << '\t';
 770
 771                         RAbundVector* rabund = new RAbundVector();
 772                         rabund->setLabel(thisLabel);
 773
 774                         //add in singletons
 775                         if (listSingle != NULL) {
 776                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 777                                         outList << listSingle->get(j) << '\t';
 778                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
 779                                 }
 780                         }
 781
 782                         //get the list info from each file
 783                         for (int k = 0; k < listNames.size(); k++) {
 784
 785                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
 786
 787                                 InputData* input = new InputData(listNames[k], "list");
 788                                 ListVector* list = input->getListVector(thisLabel);
 789
 790                                 //this file has reached the end
 791                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 792                                 else {
 793                                         for (int j = 0; j < list->getNumBins(); j++) {
 794                                                 outList << list->get(j) << '\t';
 795                                                 rabund->push_back(m->getNumNames(list->get(j)));
 796                                         }
 797                                         delete list;
 798                                 }
 799                                 delete input;
 800                         }
 801
 802                         SAbundVector sabund = rabund->getSAbundVector();
 803
 804                         sabund.print(outSabund);
 805                         rabund->print(outRabund);
 806                         outList << endl;
 807
 808                         delete rabund;
 809                 }
 810
 811                 outList.close();
 812                 outRabund.close();
 813                 outSabund.close();
 814
 815                 if (listSingle != NULL) { delete listSingle;  }
 816
 817                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
 818
 819                 return 0;
 820         }
 821         catch(exception& e) {
 822                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 823                 exit(1);
 824         }
 825 }
 826
 827 //**********************************************************************************************************************
 828
 829 void ClusterSplitCommand::printData(ListVector* oldList){
 830         try {
 831                 string label = oldList->getLabel();
 832                 RAbundVector oldRAbund = oldList->getRAbundVector();
 833
 834                 oldRAbund.setLabel(label);
 835                 if (m->isTrue(showabund)) {
 836                         oldRAbund.getSAbundVector().print(cout);
 837                 }
 838                 oldRAbund.print(outRabund);
 839                 oldRAbund.getSAbundVector().print(outSabund);
 840
 841                 oldList->print(outList);
 842         }
 843         catch(exception& e) {
 844                 m->errorOut(e, "ClusterSplitCommand", "printData");
 845                 exit(1);
 846         }
 847 }
 848 //**********************************************************************************************************************
 849 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
 850         try {
 851
 852         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 853                 int process = 0;
 854                 int exitCommand = 1;
 855                 processIDS.clear();
 856
 857                 //loop through and create all the processes you want
 858                 while (process != processors) {
 859                         int pid = fork();
 860
 861                         if (pid > 0) {
 862                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 863                                 process++;
 864                         }else if (pid == 0){
 865                                 set<string> labels;
 866                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 867
 868                                 //write out names to file
 869                                 string filename = toString(getpid()) + ".temp";
 870                                 ofstream out;
 871                                 m->openOutputFile(filename, out);
 872                                 out << tag << endl;
 873                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 874                                 out.close();
 875
 876                                 //print out labels
 877                                 ofstream outLabels;
 878                                 filename = toString(getpid()) + ".temp.labels";
 879                                 m->openOutputFile(filename, outLabels);
 880
 881                                 outLabels << cutoff << endl;
 882                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 883                                         outLabels << (*it) << endl;
 884                                 }
 885                                 outLabels.close();
 886
 887                                 exit(0);
 888                         }else {
 889                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 890                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 891                                 exit(0);
 892                         }
 893                 }
 894
 895                 //force parent to wait until all the processes are done
 896                 for (int i=0;i<processors;i++) {
 897                         int temp = processIDS[i];
 898                         wait(&temp);
 899                 }
 900
 901                 return exitCommand;
 902         #endif
 903
 904         }
 905         catch(exception& e) {
 906                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
 907                 exit(1);
 908         }
 909 }
 910 //**********************************************************************************************************************
 911
 912 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 913         try {
 914                 Cluster* cluster;
 915                 SparseMatrix* matrix;
 916                 ListVector* list;
 917                 ListVector oldList;
 918                 RAbundVector* rabund;
 919
 920                 vector<string> listFileNames;
 921
 922                 double smallestCutoff = cutoff;
 923
 924                 //cluster each distance file
 925                 for (int i = 0; i < distNames.size(); i++) {
 926                         if (m->control_pressed) { return listFileNames; }
 927
 928                         string thisNamefile = distNames[i].begin()->second;
 929                         string thisDistFile = distNames[i].begin()->first;
 930
 931                         //read in distance file
 932                         globaldata->setNameFile(thisNamefile);
 933                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 934
 935                         #ifdef USE_MPI
 936                                 int pid;
 937                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 938
 939                                 //output your files too
 940                                 if (pid != 0) {
 941                                         cout << endl << "Reading " << thisDistFile << endl;
 942                                 }
 943                         #endif
 944
 945                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
 946
 947                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
 948                         read->setCutoff(cutoff);
 949
 950                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
 951                         nameMap->readMap();
 952                         read->read(nameMap);
 953
 954                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
 955
 956                         list = read->getListVector();
 957                         oldList = *list;
 958                         matrix = read->getMatrix();
 959
 960                         delete read;
 961                         delete nameMap;
 962
 963
 964                         #ifdef USE_MPI
 965                                 //output your files too
 966                                 if (pid != 0) {
 967                                         cout << endl << "Clustering " << thisDistFile << endl;
 968                                 }
 969                         #endif
 970
 971                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 972
 973                         rabund = new RAbundVector(list->getRAbundVector());
 974
 975                         //create cluster
 976                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
 977                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
 978                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
 979                         tag = cluster->getTag();
 980
 981                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
 982                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
 983
 984                         ofstream listFile;
 985                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
 986
 987                         listFileNames.push_back(fileroot+ tag + ".list");
 988
 989                         time_t estart = time(NULL);
 990
 991                         float previousDist = 0.00000;
 992                         float rndPreviousDist = 0.00000;
 993
 994                         oldList = *list;
 995
 996                         print_start = true;
 997                         start = time(NULL);
 998                         double saveCutoff = cutoff;
 999
1000                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1001
1002                                 if (m->control_pressed) { //clean up
1003                                         delete matrix; delete list;     delete cluster; delete rabund;
1004                                         listFile.close();
1005                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1006                                         listFileNames.clear(); return listFileNames;
1007                                 }
1008
1009                                 cluster->update(saveCutoff);
1010
1011                                 float dist = matrix->getSmallDist();
1012                                 float rndDist;
1013                                 if (hard) {
1014                                         rndDist = m->ceilDist(dist, precision);
1015                                 }else{
1016                                         rndDist = m->roundDist(dist, precision);
1017                                 }
1018
1019                                 if(previousDist <= 0.0000 && dist != previousDist){
1020                                         oldList.setLabel("unique");
1021                                         oldList.print(listFile);
1022                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1023                                 }
1024                                 else if(rndDist != rndPreviousDist){
1025                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1026                                         oldList.print(listFile);
1027                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1028                                 }
1029
1030                                 previousDist = dist;
1031                                 rndPreviousDist = rndDist;
1032                                 oldList = *list;
1033                         }
1034
1035
1036                         if(previousDist <= 0.0000){
1037                                 oldList.setLabel("unique");
1038                                 oldList.print(listFile);
1039                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1040                         }
1041                         else if(rndPreviousDist<cutoff){
1042                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1043                                 oldList.print(listFile);
1044                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1045                         }
1046
1047                         delete matrix; delete list;     delete cluster; delete rabund;
1048                         listFile.close();
1049
1050                         if (m->control_pressed) { //clean up
1051                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1052                                 listFileNames.clear(); return listFileNames;
1053                         }
1054
1055                         remove(thisDistFile.c_str());
1056                         remove(thisNamefile.c_str());
1057
1058                         if (saveCutoff != cutoff) {
1059                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1060                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1061
1062                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1063                         }
1064
1065                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1066                 }
1067
1068                 cutoff = smallestCutoff;
1069
1070                 return listFileNames;
1071
1072         }
1073         catch(exception& e) {
1074                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1075                 exit(1);
1076         }
1077
1078
1079 }
1080
1081 //**********************************************************************************************************************