clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11 #include "readcluster.h"
  12 #include "splitmatrix.h"
  13 #include "readphylip.h"
  14 #include "readcolumn.h"
  15 #include "readmatrix.hpp"
  16 #include "inputdata.h"
  17
  18 //**********************************************************************************************************************
  19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  21         try{
  22                 globaldata = GlobalData::getInstance();
  23                 abort = false;
  24                 format = "";
  25
  26                 //allow user to run help
  27                 if(option == "help") { help(); abort = true; }
  28
  29                 else {
  30                         //valid paramters for this command
  31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  33
  34                         OptionParser parser(option);
  35                         map<string,string> parameters = parser.getParameters();
  36
  37                         ValidParameters validParameter;
  38
  39                         //check to make sure all parameters are valid for command
  40                         map<string,string>::iterator it;
  41                         for (it = parameters.begin(); it != parameters.end(); it++) {
  42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
  43                                         abort = true;
  44                                 }
  45                         }
  46
  47                         globaldata->newRead();
  48
  49                         //if the user changes the output directory command factory will send this info to us in the output parameter
  50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
  51
  52                                 //if the user changes the input directory command factory will send this info to us in the output parameter
  53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
  54                         if (inputDir == "not found"){   inputDir = "";          }
  55                         else {
  56                                 string path;
  57                                 it = parameters.find("phylip");
  58                                 //user has given a template file
  59                                 if(it != parameters.end()){
  60                                         path = hasPath(it->second);
  61                                         //if the user has not given a path then, add inputdir. else leave path alone.
  62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
  63                                 }
  64
  65                                 it = parameters.find("column");
  66                                 //user has given a template file
  67                                 if(it != parameters.end()){
  68                                         path = hasPath(it->second);
  69                                         //if the user has not given a path then, add inputdir. else leave path alone.
  70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
  71                                 }
  72
  73                                 it = parameters.find("name");
  74                                 //user has given a template file
  75                                 if(it != parameters.end()){
  76                                         path = hasPath(it->second);
  77                                         //if the user has not given a path then, add inputdir. else leave path alone.
  78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
  79                                 }
  80
  81                                 it = parameters.find("taxonomy");
  82                                 //user has given a template file
  83                                 if(it != parameters.end()){
  84                                         path = hasPath(it->second);
  85                                         //if the user has not given a path then, add inputdir. else leave path alone.
  86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
  87                                 }
  88
  89                                 it = parameters.find("fasta");
  90                                 //user has given a template file
  91                                 if(it != parameters.end()){
  92                                         path = hasPath(it->second);
  93                                         //if the user has not given a path then, add inputdir. else leave path alone.
  94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
  95                                 }
  96                         }
  97
  98                         //check for required parameters
  99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 100                         if (phylipfile == "not open") { abort = true; }
 101                         else if (phylipfile == "not found") { phylipfile = ""; }
 102                         else {  distfile = phylipfile;  format = "phylip";      }
 103
 104                         columnfile = validParameter.validFile(parameters, "column", true);
 105                         if (columnfile == "not open") { abort = true; }
 106                         else if (columnfile == "not found") { columnfile = ""; }
 107                         else {  distfile = columnfile; format = "column";       }
 108
 109                         namefile = validParameter.validFile(parameters, "name", true);
 110                         if (namefile == "not open") { abort = true; }
 111                         else if (namefile == "not found") { namefile = ""; }
 112
 113                         fastafile = validParameter.validFile(parameters, "fasta", true);
 114                         if (fastafile == "not open") { abort = true; }
 115                         else if (fastafile == "not found") { fastafile = ""; }
 116                         else { distfile = fastafile;  splitmethod = "fasta";  }
 117
 118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 119                         if (taxFile == "not open") { abort = true; }
 120                         else if (taxFile == "not found") { taxFile = ""; }
 121
 122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
 123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 124
 125                         if (columnfile != "") {
 126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
 127                         }
 128
 129                         if (fastafile != "") {
 130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 131                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 132                         }
 133
 134                         //check for optional parameter and set defaults
 135                         // ...at some point should added some additional type checking...
 136                         //get user cutoff and precision or use defaults
 137                         string temp;
 138                         temp = validParameter.validFile(parameters, "precision", false);
 139                         if (temp == "not found") { temp = "100"; }
 140                         //saves precision legnth for formatting below
 141                         length = temp.length();
 142                         convert(temp, precision);
 143
 144                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
 145                         hard = isTrue(temp);
 146
 147                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 148                         large = isTrue(temp);
 149
 150                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 151                         convert(temp, processors);
 152
 153                         temp = validParameter.validFile(parameters, "splitmethod", false);
 154                         if (splitmethod != "fasta") {
 155                                 if (temp == "not found")  { splitmethod = "distance"; }
 156                                 else {  splitmethod = temp; }
 157                         }
 158
 159                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
 160                         convert(temp, cutoff);
 161                         cutoff += (5 / (precision * 10.0));
 162
 163                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
 164                         convert(temp, taxLevelCutoff);
 165
 166                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
 167
 168                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 169                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 170
 171                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 172                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 173
 174                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 175
 176                         showabund = validParameter.validFile(parameters, "showabund", false);
 177                         if (showabund == "not found") { showabund = "T"; }
 178
 179                         timing = validParameter.validFile(parameters, "timing", false);
 180                         if (timing == "not found") { timing = "F"; }
 181
 182                 }
 183         }
 184         catch(exception& e) {
 185                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 186                 exit(1);
 187         }
 188 }
 189
 190 //**********************************************************************************************************************
 191
 192 void ClusterSplitCommand::help(){
 193         try {
 194                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
 195                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
 196                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
 197                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
 198                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
 199                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
 200                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
 201                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 202                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
 203                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 204                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
 205                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
 206                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
 207                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
 208                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 209                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
 210                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 211                 #ifdef USE_MPI
 212                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
 213                 #endif
 214                 m->mothurOut("The cluster.split command should be in the following format: \n");
 215                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 216                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
 217
 218         }
 219         catch(exception& e) {
 220                 m->errorOut(e, "ClusterSplitCommand", "help");
 221                 exit(1);
 222         }
 223 }
 224
 225 //**********************************************************************************************************************
 226
 227 ClusterSplitCommand::~ClusterSplitCommand(){}
 228
 229 //**********************************************************************************************************************
 230
 231 int ClusterSplitCommand::execute(){
 232         try {
 233
 234                 if (abort == true) {    return 0;       }
 235
 236                 time_t estart;
 237                 vector<string> listFileNames;
 238                 set<string> labels;
 239                 string singletonName = "";
 240
 241                 //****************** file prep work ******************************//
 242                 #ifdef USE_MPI
 243                         int pid;
 244                         int tag = 2001;
 245                         MPI_Status status;
 246                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 247                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 248
 249                         if (pid == 0) { //only process 0 converts and splits
 250
 251                 #endif
 252
 253                 //if user gave a phylip file convert to column file
 254                 if (format == "phylip") {
 255                         estart = time(NULL);
 256                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 257
 258                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 259
 260                         NameAssignment* nameMap = NULL;
 261                         convert->setFormat("phylip");
 262                         convert->read(nameMap);
 263
 264                         if (m->control_pressed) {  delete convert;  return 0;  }
 265
 266                         distfile = convert->getOutputFile();
 267
 268                         //if no names file given with phylip file, create it
 269                         ListVector* listToMakeNameFile =  convert->getListVector();
 270                         if (namefile == "") {  //you need to make a namefile for split matrix
 271                                 ofstream out;
 272                                 namefile = phylipfile + ".names";
 273                                 openOutputFile(namefile, out);
 274                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 275                                         string bin = listToMakeNameFile->get(i);
 276                                         out << bin << '\t' << bin << endl;
 277                                 }
 278                                 out.close();
 279                         }
 280                         delete listToMakeNameFile;
 281                         delete convert;
 282
 283                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 284                 }
 285                 if (m->control_pressed) { return 0; }
 286
 287                 estart = time(NULL);
 288                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 289
 290                 //split matrix into non-overlapping groups
 291                 SplitMatrix* split;
 292                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
 293                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
 294                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir);      }
 295                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 296
 297                 split->split();
 298
 299                 if (m->control_pressed) { delete split; return 0; }
 300
 301                 singletonName = split->getSingletonNames();
 302                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 303                 delete split;
 304
 305                 if (m->control_pressed) { return 0; }
 306
 307                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 308                 estart = time(NULL);
 309
 310                 //****************** break up files between processes and cluster each file set ******************************//
 311         #ifdef USE_MPI
 312                         ////you are process 0 from above////
 313
 314                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 315                         dividedNames.resize(processors);
 316
 317                         //for each file group figure out which process will complete it
 318                         //want to divide the load intelligently so the big files are spread between processes
 319                         int count = 1;
 320                         for (int i = 0; i < distName.size(); i++) {
 321                                 int processToAssign = (i+1) % processors;
 322                                 if (processToAssign == 0) { processToAssign = processors; }
 323
 324                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 325                         }
 326
 327                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 328                         for (int i = 0; i < processors; i++) {
 329                                 int remainder = ((i+1) % processors);
 330                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 331                         }
 332
 333
 334                         //send each child the list of files it needs to process
 335                         for(int i = 1; i < processors; i++) {
 336                                 //send number of file pairs
 337                                 int num = dividedNames[i].size();
 338                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 339
 340                                 for (int j = 0; j < num; j++) { //send filenames to process i
 341                                         char tempDistFileName[1024];
 342                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 343                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 344
 345                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 346                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 347
 348                                         char tempNameFileName[1024];
 349                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 350                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 351
 352                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 353                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 354                                 }
 355                         }
 356
 357                         //process your share
 358                         listFileNames = cluster(dividedNames[0], labels);
 359
 360                         //receive the other processes info
 361                         for(int i = 1; i < processors; i++) {
 362                                 int num = dividedNames[i].size();
 363
 364                                 //send list filenames to root process
 365                                 for (int j = 0; j < num; j++) {
 366                                         int lengthList = 0;
 367                                         char tempListFileName[1024];
 368
 369                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 370                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 371
 372                                         string myListFileName = tempListFileName;
 373                                         myListFileName = myListFileName.substr(0, lengthList);
 374
 375                                         listFileNames.push_back(myListFileName);
 376                                 }
 377
 378                                 //send Labels to root process
 379                                 int numLabels = 0;
 380                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 381
 382                                 for (int j = 0; j < numLabels; j++) {
 383                                         int lengthLabel = 0;
 384                                         char tempLabel[100];
 385
 386                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 387                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 388
 389                                         string myLabel = tempLabel;
 390                                         myLabel = myLabel.substr(0, lengthLabel);
 391
 392                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 393                                 }
 394                         }
 395
 396                 }else { //you are a child process
 397                         vector < map<string, string> >  myNames;
 398
 399                         //recieve the files you need to process
 400                         //receive number of file pairs
 401                         int num = 0;
 402                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 403
 404                         myNames.resize(num);
 405
 406                         for (int j = 0; j < num; j++) { //receive filenames to process
 407                                 int lengthDist = 0;
 408                                 char tempDistFileName[1024];
 409
 410                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 411                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 412
 413                                 string myDistFileName = tempDistFileName;
 414                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 415
 416                                 int lengthName = 0;
 417                                 char tempNameFileName[1024];
 418
 419                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 420                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 421
 422                                 string myNameFileName = tempNameFileName;
 423                                 myNameFileName = myNameFileName.substr(0, lengthName);
 424
 425                                 //save file name
 426                                 myNames[j][myDistFileName] = myNameFileName;
 427                         }
 428
 429                         //process them
 430                         listFileNames = cluster(myNames, labels);
 431
 432                         //send list filenames to root process
 433                         for (int j = 0; j < num; j++) {
 434                                 char tempListFileName[1024];
 435                                 strcpy(tempListFileName, listFileNames[j].c_str());
 436                                 int lengthList = listFileNames[j].length();
 437
 438                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 439                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 440                         }
 441
 442                         //send Labels to root process
 443                         int numLabels = labels.size();
 444                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 445
 446                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 447                                 char tempLabel[100];
 448                                 strcpy(tempLabel, (*it).c_str());
 449                                 int lengthLabel = (*it).length();
 450
 451                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 452                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 453                         }
 454                 }
 455
 456                 //make everyone wait
 457                 MPI_Barrier(MPI_COMM_WORLD);
 458
 459         #else
 460
 461                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 462                                 if(processors == 1){
 463                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 464                                 }else{
 465                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 466                                         dividedNames.resize(processors);
 467
 468                                         //for each file group figure out which process will complete it
 469                                         //want to divide the load intelligently so the big files are spread between processes
 470                                         int count = 1;
 471                                         for (int i = 0; i < distName.size(); i++) {
 472                                                 int processToAssign = (i+1) % processors;
 473                                                 if (processToAssign == 0) { processToAssign = processors; }
 474
 475                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 476                                         }
 477
 478                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 479                                         for (int i = 0; i < processors; i++) {
 480                                                 int remainder = ((i+1) % processors);
 481                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 482                                         }
 483
 484                                         createProcesses(dividedNames);
 485
 486                                         if (m->control_pressed) { return 0; }
 487
 488                                         //get list of list file names from each process
 489                                         for(int i=0;i<processors;i++){
 490                                                 string filename = toString(processIDS[i]) + ".temp";
 491                                                 ifstream in;
 492                                                 openInputFile(filename, in);
 493
 494                                                 in >> tag; gobble(in);
 495
 496                                                 while(!in.eof()) {
 497                                                         string tempName;
 498                                                         in >> tempName; gobble(in);
 499                                                         listFileNames.push_back(tempName);
 500                                                 }
 501                                                 in.close();
 502                                                 remove((toString(processIDS[i]) + ".temp").c_str());
 503
 504                                                 //get labels
 505                                                 filename = toString(processIDS[i]) + ".temp.labels";
 506                                                 ifstream in2;
 507                                                 openInputFile(filename, in2);
 508
 509                                                 while(!in2.eof()) {
 510                                                         string tempName;
 511                                                         in2 >> tempName; gobble(in2);
 512                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
 513                                                 }
 514                                                 in2.close();
 515                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
 516                                         }
 517                                 }
 518                 #else
 519                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 520                 #endif
 521         #endif
 522                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 523
 524                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 525
 526                 //****************** merge list file and create rabund and sabund files ******************************//
 527                 estart = time(NULL);
 528                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 529
 530                 #ifdef USE_MPI
 531                         if (pid == 0) { //only process 0 merges
 532                 #endif
 533
 534                 ListVector* listSingle;
 535                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 536
 537                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 538
 539                 mergeLists(listFileNames, labelBins, listSingle);
 540
 541                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 542
 543                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 544
 545                 m->mothurOutEndLine();
 546                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 547                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 548                 m->mothurOutEndLine();
 549
 550                 #ifdef USE_MPI
 551                         } //only process 0 merges
 552
 553                         //make everyone wait
 554                         MPI_Barrier(MPI_COMM_WORLD);
 555                 #endif
 556
 557                 return 0;
 558         }
 559         catch(exception& e) {
 560                 m->errorOut(e, "ClusterSplitCommand", "execute");
 561                 exit(1);
 562         }
 563 }
 564 //**********************************************************************************************************************
 565 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 566         try {
 567
 568                 map<float, int> labelBin;
 569                 vector<float> orderFloat;
 570                 int numSingleBins;
 571
 572                 //read in singletons
 573                 if (singleton != "none") {
 574                         ifstream in;
 575                         openInputFile(singleton, in);
 576
 577                         string firstCol, secondCol;
 578                         listSingle = new ListVector();
 579                         while (!in.eof()) {
 580                                 in >> firstCol >> secondCol; gobble(in);
 581                                 listSingle->push_back(secondCol);
 582                         }
 583                         in.close();
 584                         remove(singleton.c_str());
 585
 586                         numSingleBins = listSingle->getNumBins();
 587                 }else{  listSingle = NULL; numSingleBins = 0;  }
 588
 589                 //go through users set and make them floats so we can sort them
 590                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 591                         float temp = -10.0;
 592
 593                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 594                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 595
 596                         orderFloat.push_back(temp);
 597                         labelBin[temp] = numSingleBins; //initialize numbins
 598                 }
 599
 600                 //sort order
 601                 sort(orderFloat.begin(), orderFloat.end());
 602                 userLabels.clear();
 603
 604                 //get the list info from each file
 605                 for (int k = 0; k < listNames.size(); k++) {
 606
 607                         if (m->control_pressed) {
 608                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
 609                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
 610                                 return labelBin;
 611                         }
 612
 613                         InputData* input = new InputData(listNames[k], "list");
 614                         ListVector* list = input->getListVector();
 615                         string lastLabel = list->getLabel();
 616
 617                         string filledInList = listNames[k] + "filledInTemp";
 618                         ofstream outFilled;
 619                         openOutputFile(filledInList, outFilled);
 620
 621                         //for each label needed
 622                         for(int l = 0; l < orderFloat.size(); l++){
 623
 624                                 string thisLabel;
 625                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 626                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 627
 628                                 //this file has reached the end
 629                                 if (list == NULL) {
 630                                         list = input->getListVector(lastLabel, true);
 631                                 }else{  //do you have the distance, or do you need to fill in
 632
 633                                         float labelFloat;
 634                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 635                                         else { convert(list->getLabel(), labelFloat); }
 636
 637                                         //check for missing labels
 638                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 639                                                 //if its bigger get last label, otherwise keep it
 640                                                 delete list;
 641                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 642                                         }
 643                                         lastLabel = list->getLabel();
 644                                 }
 645
 646                                 //print to new file
 647                                 list->setLabel(thisLabel);
 648                                 list->print(outFilled);
 649
 650                                 //update labelBin
 651                                 labelBin[orderFloat[l]] += list->getNumBins();
 652
 653                                 delete list;
 654
 655                                 list = input->getListVector();
 656                         }
 657
 658                         if (list != NULL) { delete list; }
 659                         delete input;
 660
 661                         outFilled.close();
 662                         remove(listNames[k].c_str());
 663                         rename(filledInList.c_str(), listNames[k].c_str());
 664                 }
 665
 666                 return labelBin;
 667         }
 668         catch(exception& e) {
 669                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 670                 exit(1);
 671         }
 672 }
 673 //**********************************************************************************************************************
 674 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 675         try {
 676                 if (outputDir == "") { outputDir += hasPath(distfile); }
 677                 fileroot = outputDir + getRootName(getSimpleName(distfile));
 678
 679                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
 680                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
 681                 openOutputFile(fileroot+ tag + ".list",         outList);
 682
 683                 outputNames.push_back(fileroot+ tag + ".sabund");
 684                 outputNames.push_back(fileroot+ tag + ".rabund");
 685                 outputNames.push_back(fileroot+ tag + ".list");
 686
 687                 map<float, int>::iterator itLabel;
 688
 689                 //for each label needed
 690                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 691
 692                         string thisLabel;
 693                         if (itLabel->first == -1) { thisLabel = "unique"; }
 694                         else { thisLabel = toString(itLabel->first,  length-1);  }
 695
 696                         outList << thisLabel << '\t' << itLabel->second << '\t';
 697
 698                         RAbundVector* rabund = new RAbundVector();
 699                         rabund->setLabel(thisLabel);
 700
 701                         //add in singletons
 702                         if (listSingle != NULL) {
 703                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 704                                         outList << listSingle->get(j) << '\t';
 705                                         rabund->push_back(getNumNames(listSingle->get(j)));
 706                                 }
 707                         }
 708
 709                         //get the list info from each file
 710                         for (int k = 0; k < listNames.size(); k++) {
 711
 712                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
 713
 714                                 InputData* input = new InputData(listNames[k], "list");
 715                                 ListVector* list = input->getListVector(thisLabel);
 716
 717                                 //this file has reached the end
 718                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 719                                 else {
 720                                         for (int j = 0; j < list->getNumBins(); j++) {
 721                                                 outList << list->get(j) << '\t';
 722                                                 rabund->push_back(getNumNames(list->get(j)));
 723                                         }
 724                                         delete list;
 725                                 }
 726                                 delete input;
 727                         }
 728
 729                         SAbundVector sabund = rabund->getSAbundVector();
 730
 731                         sabund.print(outSabund);
 732                         rabund->print(outRabund);
 733                         outList << endl;
 734
 735                         delete rabund;
 736                 }
 737
 738                 outList.close();
 739                 outRabund.close();
 740                 outSabund.close();
 741
 742                 if (listSingle != NULL) { delete listSingle;  }
 743
 744                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
 745
 746                 return 0;
 747         }
 748         catch(exception& e) {
 749                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 750                 exit(1);
 751         }
 752 }
 753
 754 //**********************************************************************************************************************
 755
 756 void ClusterSplitCommand::printData(ListVector* oldList){
 757         try {
 758                 string label = oldList->getLabel();
 759                 RAbundVector oldRAbund = oldList->getRAbundVector();
 760
 761                 oldRAbund.setLabel(label);
 762                 if (isTrue(showabund)) {
 763                         oldRAbund.getSAbundVector().print(cout);
 764                 }
 765                 oldRAbund.print(outRabund);
 766                 oldRAbund.getSAbundVector().print(outSabund);
 767
 768                 oldList->print(outList);
 769         }
 770         catch(exception& e) {
 771                 m->errorOut(e, "ClusterSplitCommand", "printData");
 772                 exit(1);
 773         }
 774 }
 775 //**********************************************************************************************************************
 776 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
 777         try {
 778
 779         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 780                 int process = 0;
 781                 int exitCommand = 1;
 782                 processIDS.clear();
 783
 784                 //loop through and create all the processes you want
 785                 while (process != processors) {
 786                         int pid = fork();
 787
 788                         if (pid > 0) {
 789                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 790                                 process++;
 791                         }else if (pid == 0){
 792                                 set<string> labels;
 793                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 794
 795                                 //write out names to file
 796                                 string filename = toString(getpid()) + ".temp";
 797                                 ofstream out;
 798                                 openOutputFile(filename, out);
 799                                 out << tag << endl;
 800                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 801                                 out.close();
 802
 803                                 //print out labels
 804                                 ofstream outLabels;
 805                                 filename = toString(getpid()) + ".temp.labels";
 806                                 openOutputFile(filename, outLabels);
 807
 808                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 809                                         outLabels << (*it) << endl;
 810                                 }
 811                                 outLabels.close();
 812
 813                                 exit(0);
 814                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
 815                 }
 816
 817                 //force parent to wait until all the processes are done
 818                 for (int i=0;i<processors;i++) {
 819                         int temp = processIDS[i];
 820                         wait(&temp);
 821                 }
 822
 823                 return exitCommand;
 824         #endif
 825
 826         }
 827         catch(exception& e) {
 828                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
 829                 exit(1);
 830         }
 831 }
 832 //**********************************************************************************************************************
 833
 834 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 835         try {
 836                 Cluster* cluster;
 837                 SparseMatrix* matrix;
 838                 ListVector* list;
 839                 ListVector oldList;
 840                 RAbundVector* rabund;
 841
 842                 vector<string> listFileNames;
 843
 844                 //cluster each distance file
 845                 for (int i = 0; i < distNames.size(); i++) {
 846
 847                         string thisNamefile = distNames[i].begin()->second;
 848                         string thisDistFile = distNames[i].begin()->first;
 849
 850                         //read in distance file
 851                         globaldata->setNameFile(thisNamefile);
 852                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 853
 854                         #ifdef USE_MPI
 855                                 int pid;
 856                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 857
 858                                 //output your files too
 859                                 if (pid != 0) {
 860                                         cout << endl << "Reading " << thisDistFile << endl;
 861                                 }
 862                         #endif
 863
 864                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
 865
 866                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
 867                         read->setCutoff(cutoff);
 868
 869                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
 870                         nameMap->readMap();
 871                         read->read(nameMap);
 872
 873                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
 874
 875                         list = read->getListVector();
 876                         oldList = *list;
 877                         matrix = read->getMatrix();
 878
 879                         delete read;
 880                         delete nameMap;
 881
 882
 883                         #ifdef USE_MPI
 884                                 //output your files too
 885                                 if (pid != 0) {
 886                                         cout << endl << "Clustering " << thisDistFile << endl;
 887                                 }
 888                         #endif
 889
 890                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 891
 892                         rabund = new RAbundVector(list->getRAbundVector());
 893
 894                         //create cluster
 895                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
 896                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
 897                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
 898                         tag = cluster->getTag();
 899
 900                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
 901                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
 902
 903                         ofstream listFile;
 904                         openOutputFile(fileroot+ tag + ".list", listFile);
 905
 906                         listFileNames.push_back(fileroot+ tag + ".list");
 907
 908                         time_t estart = time(NULL);
 909
 910                         float previousDist = 0.00000;
 911                         float rndPreviousDist = 0.00000;
 912
 913                         oldList = *list;
 914
 915                         print_start = true;
 916                         start = time(NULL);
 917                         double saveCutoff = cutoff;
 918
 919                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
 920
 921                                 if (m->control_pressed) { //clean up
 922                                         delete matrix; delete list;     delete cluster; delete rabund;
 923                                         listFile.close();
 924                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 925                                         listFileNames.clear(); return listFileNames;
 926                                 }
 927
 928                                 cluster->update(cutoff);
 929
 930                                 float dist = matrix->getSmallDist();
 931                                 float rndDist;
 932                                 if (hard) {
 933                                         rndDist = ceilDist(dist, precision);
 934                                 }else{
 935                                         rndDist = roundDist(dist, precision);
 936                                 }
 937
 938                                 if(previousDist <= 0.0000 && dist != previousDist){
 939                                         oldList.setLabel("unique");
 940                                         oldList.print(listFile);
 941                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
 942                                 }
 943                                 else if(rndDist != rndPreviousDist){
 944                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
 945                                         oldList.print(listFile);
 946                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 947                                 }
 948
 949                                 previousDist = dist;
 950                                 rndPreviousDist = rndDist;
 951                                 oldList = *list;
 952                         }
 953
 954
 955                         if(previousDist <= 0.0000){
 956                                 oldList.setLabel("unique");
 957                                 oldList.print(listFile);
 958                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
 959                         }
 960                         else if(rndPreviousDist<cutoff){
 961                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
 962                                 oldList.print(listFile);
 963                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 964                         }
 965
 966                         delete matrix; delete list;     delete cluster; delete rabund;
 967                         listFile.close();
 968
 969                         if (m->control_pressed) { //clean up
 970                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 971                                 listFileNames.clear(); return listFileNames;
 972                         }
 973
 974                         remove(thisDistFile.c_str());
 975                         remove(thisNamefile.c_str());
 976                 }
 977
 978
 979                 return listFileNames;
 980
 981         }
 982         catch(exception& e) {
 983                 m->errorOut(e, "ClusterSplitCommand", "cluster");
 984                 exit(1);
 985         }
 986
 987
 988 }
 989
 990 //**********************************************************************************************************************