clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
  20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
  21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
  22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
  23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
  24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
  25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
  26                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
  27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
  28                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
  29                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
  30                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
  31                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
  32         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
  33                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
  34                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
  35
  36                 vector<string> myArray;
  37                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  38                 return myArray;
  39         }
  40         catch(exception& e) {
  41                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  42                 exit(1);
  43         }
  44 }
  45 //**********************************************************************************************************************
  46 string ClusterSplitCommand::getHelpString(){
  47         try {
  48                 string helpString = "";
  49                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
  50                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  51                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  52                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  53                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  54                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  55                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  56                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  57                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  58                 helpString += "The name parameter allows you to enter your name file. \n";
  59         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  60                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  61                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  62                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  63                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  64                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  65                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  66                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  67         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  68 #ifdef USE_MPI
  69                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  70 #endif
  71                 helpString += "The cluster.split command should be in the following format: \n";
  72                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  73                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  74                 return helpString;
  75         }
  76         catch(exception& e) {
  77                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  78                 exit(1);
  79         }
  80 }
  81 //**********************************************************************************************************************
  82 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){
  83         try {
  84         string outputFileName = "";
  85                 map<string, vector<string> >::iterator it;
  86
  87         //is this a type this command creates
  88         it = outputTypes.find(type);
  89         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
  90         else {
  91             if (type == "list") {  outputFileName =  "list"; }
  92             else if (type == "rabund") {  outputFileName =  "rabund"; }
  93             else if (type == "sabund") {  outputFileName =  "sabund"; }
  94             else if (type == "column") {  outputFileName =  "dist"; }
  95             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
  96         }
  97         return outputFileName;
  98         }
  99         catch(exception& e) {
 100                 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
 101                 exit(1);
 102         }
 103 }
 104 //**********************************************************************************************************************
 105 ClusterSplitCommand::ClusterSplitCommand(){
 106         try {
 107                 abort = true; calledHelp = true;
 108                 setParameters();
 109                 vector<string> tempOutNames;
 110                 outputTypes["list"] = tempOutNames;
 111                 outputTypes["rabund"] = tempOutNames;
 112                 outputTypes["sabund"] = tempOutNames;
 113                 outputTypes["column"] = tempOutNames;
 114         }
 115         catch(exception& e) {
 116                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 117                 exit(1);
 118         }
 119 }
 120 //**********************************************************************************************************************
 121 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 122 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 123         try{
 124                 abort = false; calledHelp = false;
 125                 format = "";
 126
 127                 //allow user to run help
 128                 if(option == "help") { help(); abort = true; calledHelp = true; }
 129                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 130
 131                 else {
 132                         vector<string> myArray = setParameters();
 133
 134                         OptionParser parser(option);
 135                         map<string,string> parameters = parser.getParameters();
 136
 137                         ValidParameters validParameter("cluster.split");
 138
 139                         //check to make sure all parameters are valid for command
 140                         map<string,string>::iterator it;
 141                         for (it = parameters.begin(); it != parameters.end(); it++) {
 142                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 143                                         abort = true;
 144                                 }
 145                         }
 146
 147                         //initialize outputTypes
 148                         vector<string> tempOutNames;
 149                         outputTypes["list"] = tempOutNames;
 150                         outputTypes["rabund"] = tempOutNames;
 151                         outputTypes["sabund"] = tempOutNames;
 152                         outputTypes["column"] = tempOutNames;
 153
 154                         //if the user changes the output directory command factory will send this info to us in the output parameter
 155                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 156
 157                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 158                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 159                         if (inputDir == "not found"){   inputDir = "";          }
 160                         else {
 161                                 string path;
 162                                 it = parameters.find("phylip");
 163                                 //user has given a template file
 164                                 if(it != parameters.end()){
 165                                         path = m->hasPath(it->second);
 166                                         //if the user has not given a path then, add inputdir. else leave path alone.
 167                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 168                                 }
 169
 170                                 it = parameters.find("column");
 171                                 //user has given a template file
 172                                 if(it != parameters.end()){
 173                                         path = m->hasPath(it->second);
 174                                         //if the user has not given a path then, add inputdir. else leave path alone.
 175                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 176                                 }
 177
 178                                 it = parameters.find("name");
 179                                 //user has given a template file
 180                                 if(it != parameters.end()){
 181                                         path = m->hasPath(it->second);
 182                                         //if the user has not given a path then, add inputdir. else leave path alone.
 183                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 184                                 }
 185
 186                                 it = parameters.find("taxonomy");
 187                                 //user has given a template file
 188                                 if(it != parameters.end()){
 189                                         path = m->hasPath(it->second);
 190                                         //if the user has not given a path then, add inputdir. else leave path alone.
 191                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 192                                 }
 193
 194                                 it = parameters.find("fasta");
 195                                 //user has given a template file
 196                                 if(it != parameters.end()){
 197                                         path = m->hasPath(it->second);
 198                                         //if the user has not given a path then, add inputdir. else leave path alone.
 199                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 200                                 }
 201
 202                 it = parameters.find("count");
 203                                 //user has given a template file
 204                                 if(it != parameters.end()){
 205                                         path = m->hasPath(it->second);
 206                                         //if the user has not given a path then, add inputdir. else leave path alone.
 207                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 208                                 }
 209                         }
 210
 211                         //check for required parameters
 212                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 213                         if (phylipfile == "not open") { abort = true; }
 214                         else if (phylipfile == "not found") { phylipfile = ""; }
 215                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 216
 217                         columnfile = validParameter.validFile(parameters, "column", true);
 218                         if (columnfile == "not open") { abort = true; }
 219                         else if (columnfile == "not found") { columnfile = ""; }
 220                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 221
 222                         namefile = validParameter.validFile(parameters, "name", true);
 223                         if (namefile == "not open") { abort = true; namefile = "";}
 224                         else if (namefile == "not found") { namefile = "";  }
 225                         else { m->setNameFile(namefile); }
 226
 227             countfile = validParameter.validFile(parameters, "count", true);
 228                         if (countfile == "not open") { abort = true; countfile = "";}
 229                         else if (countfile == "not found") { countfile = "";  }
 230                         else { m->setCountTableFile(countfile); }
 231
 232                         fastafile = validParameter.validFile(parameters, "fasta", true);
 233                         if (fastafile == "not open") { abort = true; }
 234                         else if (fastafile == "not found") { fastafile = ""; }
 235                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 236
 237                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 238                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 239                         else if (taxFile == "not found") { taxFile = ""; }
 240                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 241
 242                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 243                                 //is there are current file available for either of these?
 244                                 //give priority to column, then phylip, then fasta
 245                                 columnfile = m->getColumnFile();
 246                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 247                                 else {
 248                                         phylipfile = m->getPhylipFile();
 249                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 250                                         else {
 251                                                 fastafile = m->getFastaFile();
 252                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 253                                                 else {
 254                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 255                                                         abort = true;
 256                                                 }
 257                                         }
 258                                 }
 259                         }
 260                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 261
 262             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 263
 264                         if (columnfile != "") {
 265                                 if ((namefile == "") && (countfile == "")) {
 266                                         namefile = m->getNameFile();
 267                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 268                                         else {
 269                                                 countfile = m->getCountTableFile();
 270                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 271                         else {
 272                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 273                             abort = true;
 274                         }
 275                                         }
 276                                 }
 277                         }
 278
 279                         if (fastafile != "") {
 280                                 if (taxFile == "") {
 281                                         taxFile = m->getTaxonomyFile();
 282                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 283                                         else {
 284                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 285                                                 abort = true;
 286                                         }
 287                                 }
 288
 289                                 if ((namefile == "") && (countfile == "")) {
 290                                         namefile = m->getNameFile();
 291                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 292                                         else {
 293                                                 countfile = m->getCountTableFile();
 294                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 295                         else {
 296                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 297                             abort = true;
 298                         }
 299                                         }
 300                                 }
 301                         }
 302
 303                         //check for optional parameter and set defaults
 304                         // ...at some point should added some additional type checking...
 305                         //get user cutoff and precision or use defaults
 306                         string temp;
 307                         temp = validParameter.validFile(parameters, "precision", false);
 308                         if (temp == "not found") { temp = "100"; }
 309                         //saves precision legnth for formatting below
 310                         length = temp.length();
 311                         m->mothurConvert(temp, precision);
 312
 313                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 314                         hard = m->isTrue(temp);
 315
 316                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 317                         large = m->isTrue(temp);
 318
 319                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 320                         m->setProcessors(temp);
 321                         m->mothurConvert(temp, processors);
 322
 323                         temp = validParameter.validFile(parameters, "splitmethod", false);
 324                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 325                                 if (temp == "not found")  { splitmethod = "distance"; }
 326                                 else {  splitmethod = temp; }
 327                         }
 328
 329             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 330                         classic = m->isTrue(temp);
 331
 332             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 333
 334                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 335                         m->mothurConvert(temp, cutoff);
 336                         cutoff += (5 / (precision * 10.0));
 337
 338                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 339                         m->mothurConvert(temp, taxLevelCutoff);
 340
 341                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 342
 343                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 344                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 345
 346                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 347                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 348
 349                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 350
 351                         showabund = validParameter.validFile(parameters, "showabund", false);
 352                         if (showabund == "not found") { showabund = "T"; }
 353
 354                         timing = validParameter.validFile(parameters, "timing", false);
 355                         if (timing == "not found") { timing = "F"; }
 356
 357                 }
 358         }
 359         catch(exception& e) {
 360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 361                 exit(1);
 362         }
 363 }
 364
 365 //**********************************************************************************************************************
 366
 367 int ClusterSplitCommand::execute(){
 368         try {
 369
 370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 371
 372                 time_t estart;
 373                 vector<string> listFileNames;
 374                 set<string> labels;
 375                 string singletonName = "";
 376                 double saveCutoff = cutoff;
 377
 378                 //****************** file prep work ******************************//
 379                 #ifdef USE_MPI
 380                         int pid;
 381                         int tag = 2001;
 382                         MPI_Status status;
 383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 385
 386                         if (pid == 0) { //only process 0 converts and splits
 387
 388                 #endif
 389
 390                 //if user gave a phylip file convert to column file
 391                 if (format == "phylip") {
 392                         estart = time(NULL);
 393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 394
 395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 396
 397                         NameAssignment* nameMap = NULL;
 398                         convert->setFormat("phylip");
 399                         convert->read(nameMap);
 400
 401                         if (m->control_pressed) {  delete convert;  return 0;  }
 402
 403                         distfile = convert->getOutputFile();
 404
 405                         //if no names file given with phylip file, create it
 406                         ListVector* listToMakeNameFile =  convert->getListVector();
 407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 408                                 ofstream out;
 409                                 namefile = phylipfile + ".names";
 410                                 m->openOutputFile(namefile, out);
 411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 412                                         string bin = listToMakeNameFile->get(i);
 413                                         out << bin << '\t' << bin << endl;
 414                                 }
 415                                 out.close();
 416                         }
 417                         delete listToMakeNameFile;
 418                         delete convert;
 419
 420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 421                 }
 422                 if (m->control_pressed) { return 0; }
 423
 424                 estart = time(NULL);
 425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 426
 427                 //split matrix into non-overlapping groups
 428                 SplitMatrix* split;
 429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 433
 434                 split->split();
 435
 436                 if (m->control_pressed) { delete split; return 0; }
 437
 438                 singletonName = split->getSingletonNames();
 439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 440                 delete split;
 441
 442                 //output a merged distance file
 443                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
 444
 445
 446                 if (m->control_pressed) { return 0; }
 447
 448                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 449                 estart = time(NULL);
 450
 451                 //****************** break up files between processes and cluster each file set ******************************//
 452         #ifdef USE_MPI
 453                         ////you are process 0 from above////
 454
 455                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 456                         dividedNames.resize(processors);
 457
 458                         //for each file group figure out which process will complete it
 459                         //want to divide the load intelligently so the big files are spread between processes
 460                         for (int i = 0; i < distName.size(); i++) {
 461                                 int processToAssign = (i+1) % processors;
 462                                 if (processToAssign == 0) { processToAssign = processors; }
 463
 464                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 465                         }
 466
 467                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 468                         for (int i = 0; i < processors; i++) {
 469                                 int remainder = ((i+1) % processors);
 470                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 471                         }
 472
 473
 474                         //send each child the list of files it needs to process
 475                         for(int i = 1; i < processors; i++) {
 476                                 //send number of file pairs
 477                                 int num = dividedNames[i].size();
 478                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 479
 480                                 for (int j = 0; j < num; j++) { //send filenames to process i
 481                                         char tempDistFileName[1024];
 482                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 483                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 484
 485                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 486                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 487
 488                                         char tempNameFileName[1024];
 489                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 490                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 491
 492                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 493                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 494                                 }
 495                         }
 496
 497                         //process your share
 498                         listFileNames = cluster(dividedNames[0], labels);
 499
 500                         //receive the other processes info
 501                         for(int i = 1; i < processors; i++) {
 502                                 int num = dividedNames[i].size();
 503
 504                                 double tempCutoff;
 505                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 506                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 507
 508                                 //send list filenames to root process
 509                                 for (int j = 0; j < num; j++) {
 510                                         int lengthList = 0;
 511                                         char tempListFileName[1024];
 512
 513                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 514                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 515
 516                                         string myListFileName = tempListFileName;
 517                                         myListFileName = myListFileName.substr(0, lengthList);
 518
 519                                         listFileNames.push_back(myListFileName);
 520                                 }
 521
 522                                 //send Labels to root process
 523                                 int numLabels = 0;
 524                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 525
 526                                 for (int j = 0; j < numLabels; j++) {
 527                                         int lengthLabel = 0;
 528                                         char tempLabel[100];
 529
 530                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 531                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 532
 533                                         string myLabel = tempLabel;
 534                                         myLabel = myLabel.substr(0, lengthLabel);
 535
 536                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 537                                 }
 538                         }
 539
 540                 }else { //you are a child process
 541                         vector < map<string, string> >  myNames;
 542
 543                         //recieve the files you need to process
 544                         //receive number of file pairs
 545                         int num = 0;
 546                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 547
 548                         myNames.resize(num);
 549
 550                         for (int j = 0; j < num; j++) { //receive filenames to process
 551                                 int lengthDist = 0;
 552                                 char tempDistFileName[1024];
 553
 554                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 555                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 556
 557                                 string myDistFileName = tempDistFileName;
 558                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 559
 560                                 int lengthName = 0;
 561                                 char tempNameFileName[1024];
 562
 563                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 564                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 565
 566                                 string myNameFileName = tempNameFileName;
 567                                 myNameFileName = myNameFileName.substr(0, lengthName);
 568
 569                                 //save file name
 570                                 myNames[j][myDistFileName] = myNameFileName;
 571                         }
 572
 573                         //process them
 574                         listFileNames = cluster(myNames, labels);
 575
 576                         //send cutoff
 577                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 578
 579                         //send list filenames to root process
 580                         for (int j = 0; j < num; j++) {
 581                                 char tempListFileName[1024];
 582                                 strcpy(tempListFileName, listFileNames[j].c_str());
 583                                 int lengthList = listFileNames[j].length();
 584
 585                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 586                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 587                         }
 588
 589                         //send Labels to root process
 590                         int numLabels = labels.size();
 591                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 592
 593                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 594                                 char tempLabel[100];
 595                                 strcpy(tempLabel, (*it).c_str());
 596                                 int lengthLabel = (*it).length();
 597
 598                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 599                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 600                         }
 601                 }
 602
 603                 //make everyone wait
 604                 MPI_Barrier(MPI_COMM_WORLD);
 605
 606         #else
 607                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 608                 //sanity check
 609                 if (processors > distName.size()) { processors = distName.size(); }
 610
 611                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 612                                 if(processors == 1){
 613                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 614                                 }else{
 615                                         listFileNames = createProcesses(distName, labels);
 616                 }
 617                 #else
 618                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 619                 #endif
 620         #endif
 621                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 622
 623                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 624
 625                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 626
 627                 //****************** merge list file and create rabund and sabund files ******************************//
 628                 estart = time(NULL);
 629                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 630
 631                 #ifdef USE_MPI
 632                         if (pid == 0) { //only process 0 merges
 633                 #endif
 634
 635                 ListVector* listSingle;
 636                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 637
 638                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 639
 640                 mergeLists(listFileNames, labelBins, listSingle);
 641
 642                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 643
 644                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 645
 646                 //set list file as new current listfile
 647                 string current = "";
 648                 itTypes = outputTypes.find("list");
 649                 if (itTypes != outputTypes.end()) {
 650                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 651                 }
 652
 653                 //set rabund file as new current rabundfile
 654                 itTypes = outputTypes.find("rabund");
 655                 if (itTypes != outputTypes.end()) {
 656                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 657                 }
 658
 659                 //set sabund file as new current sabundfile
 660                 itTypes = outputTypes.find("sabund");
 661                 if (itTypes != outputTypes.end()) {
 662                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 663                 }
 664
 665                 m->mothurOutEndLine();
 666                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 667                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 668                 m->mothurOutEndLine();
 669
 670                 #ifdef USE_MPI
 671                         } //only process 0 merges
 672
 673                         //make everyone wait
 674                         MPI_Barrier(MPI_COMM_WORLD);
 675                 #endif
 676
 677                 return 0;
 678         }
 679         catch(exception& e) {
 680                 m->errorOut(e, "ClusterSplitCommand", "execute");
 681                 exit(1);
 682         }
 683 }
 684 //**********************************************************************************************************************
 685 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 686         try {
 687
 688                 map<float, int> labelBin;
 689                 vector<float> orderFloat;
 690                 int numSingleBins;
 691
 692                 //read in singletons
 693                 if (singleton != "none") {
 694
 695             ifstream in;
 696             m->openInputFile(singleton, in);
 697
 698                         string firstCol, secondCol;
 699                         listSingle = new ListVector();
 700
 701             if (countfile != "") { m->getline(in); m->gobble(in); }
 702
 703                         while (!in.eof()) {
 704                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 705                                 if (countfile == "") { listSingle->push_back(secondCol); }
 706                 else { listSingle->push_back(firstCol); }
 707                         }
 708
 709                         in.close();
 710                         m->mothurRemove(singleton);
 711
 712                         numSingleBins = listSingle->getNumBins();
 713                 }else{  listSingle = NULL; numSingleBins = 0;  }
 714
 715                 //go through users set and make them floats so we can sort them
 716                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 717                         float temp = -10.0;
 718
 719                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 720                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 721
 722                         if (temp <= cutoff) {
 723                                 orderFloat.push_back(temp);
 724                                 labelBin[temp] = numSingleBins; //initialize numbins
 725                         }
 726                 }
 727
 728                 //sort order
 729                 sort(orderFloat.begin(), orderFloat.end());
 730                 userLabels.clear();
 731
 732                 //get the list info from each file
 733                 for (int k = 0; k < listNames.size(); k++) {
 734
 735                         if (m->control_pressed) {
 736                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 737                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 738                                 return labelBin;
 739                         }
 740
 741                         InputData* input = new InputData(listNames[k], "list");
 742                         ListVector* list = input->getListVector();
 743                         string lastLabel = list->getLabel();
 744
 745                         string filledInList = listNames[k] + "filledInTemp";
 746                         ofstream outFilled;
 747                         m->openOutputFile(filledInList, outFilled);
 748
 749                         //for each label needed
 750                         for(int l = 0; l < orderFloat.size(); l++){
 751
 752                                 string thisLabel;
 753                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 754                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 755
 756                                 //this file has reached the end
 757                                 if (list == NULL) {
 758                                         list = input->getListVector(lastLabel, true);
 759                                 }else{  //do you have the distance, or do you need to fill in
 760
 761                                         float labelFloat;
 762                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 763                                         else { convert(list->getLabel(), labelFloat); }
 764
 765                                         //check for missing labels
 766                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 767                                                 //if its bigger get last label, otherwise keep it
 768                                                 delete list;
 769                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 770                                         }
 771                                         lastLabel = list->getLabel();
 772                                 }
 773
 774                                 //print to new file
 775                                 list->setLabel(thisLabel);
 776                                 list->print(outFilled);
 777
 778                                 //update labelBin
 779                                 labelBin[orderFloat[l]] += list->getNumBins();
 780
 781                                 delete list;
 782
 783                                 list = input->getListVector();
 784                         }
 785
 786                         if (list != NULL) { delete list; }
 787                         delete input;
 788
 789                         outFilled.close();
 790                         m->mothurRemove(listNames[k]);
 791                         rename(filledInList.c_str(), listNames[k].c_str());
 792                 }
 793
 794                 return labelBin;
 795         }
 796         catch(exception& e) {
 797                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 798                 exit(1);
 799         }
 800 }
 801 //**********************************************************************************************************************
 802 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 803         try {
 804                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 805                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 806
 807         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
 808         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
 809         string listFileName = fileroot+ tag + ".";
 810         if (countfile != "") { listFileName += "unique_"; }
 811         listFileName += getOutputFileNameTag("list");
 812
 813         if (countfile == "") {
 814             m->openOutputFile(sabundFileName,   outSabund);
 815             m->openOutputFile(rabundFileName,   outRabund);
 816             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 817             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 818
 819         }
 820                 m->openOutputFile(listFileName, outList);
 821         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 822
 823
 824                 map<float, int>::iterator itLabel;
 825
 826                 //for each label needed
 827                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 828
 829                         string thisLabel;
 830                         if (itLabel->first == -1) { thisLabel = "unique"; }
 831                         else { thisLabel = toString(itLabel->first,  length-1);  }
 832
 833                         outList << thisLabel << '\t' << itLabel->second << '\t';
 834
 835             RAbundVector* rabund = NULL;
 836             if (countfile == "") {
 837                 rabund = new RAbundVector();
 838                 rabund->setLabel(thisLabel);
 839             }
 840
 841                         //add in singletons
 842                         if (listSingle != NULL) {
 843                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 844                                         outList << listSingle->get(j) << '\t';
 845                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 846                                 }
 847                         }
 848
 849                         //get the list info from each file
 850                         for (int k = 0; k < listNames.size(); k++) {
 851
 852                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 853
 854                                 InputData* input = new InputData(listNames[k], "list");
 855                                 ListVector* list = input->getListVector(thisLabel);
 856
 857                                 //this file has reached the end
 858                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 859                                 else {
 860                                         for (int j = 0; j < list->getNumBins(); j++) {
 861                                                 outList << list->get(j) << '\t';
 862                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 863                                         }
 864                                         delete list;
 865                                 }
 866                                 delete input;
 867                         }
 868
 869             if (countfile == "") {
 870                 SAbundVector sabund = rabund->getSAbundVector();
 871                 sabund.print(outSabund);
 872                 rabund->print(outRabund);
 873             }
 874                         outList << endl;
 875
 876                         if (rabund != NULL) { delete rabund; }
 877                 }
 878
 879                 outList.close();
 880         if (countfile == "") {
 881             outRabund.close();
 882             outSabund.close();
 883                 }
 884                 if (listSingle != NULL) { delete listSingle;  }
 885
 886                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 887
 888                 return 0;
 889         }
 890         catch(exception& e) {
 891                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 892                 exit(1);
 893         }
 894 }
 895
 896 //**********************************************************************************************************************
 897
 898 void ClusterSplitCommand::printData(ListVector* oldList){
 899         try {
 900                 string label = oldList->getLabel();
 901                 RAbundVector oldRAbund = oldList->getRAbundVector();
 902
 903                 oldRAbund.setLabel(label);
 904                 if (m->isTrue(showabund)) {
 905                         oldRAbund.getSAbundVector().print(cout);
 906                 }
 907                 oldRAbund.print(outRabund);
 908                 oldRAbund.getSAbundVector().print(outSabund);
 909
 910                 oldList->print(outList);
 911         }
 912         catch(exception& e) {
 913                 m->errorOut(e, "ClusterSplitCommand", "printData");
 914                 exit(1);
 915         }
 916 }
 917 //**********************************************************************************************************************
 918 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 919         try {
 920
 921         vector<string> listFiles;
 922         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 923         dividedNames.resize(processors);
 924
 925         //for each file group figure out which process will complete it
 926         //want to divide the load intelligently so the big files are spread between processes
 927         for (int i = 0; i < distName.size(); i++) {
 928             //cout << i << endl;
 929             int processToAssign = (i+1) % processors;
 930             if (processToAssign == 0) { processToAssign = processors; }
 931
 932             dividedNames[(processToAssign-1)].push_back(distName[i]);
 933             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 934         }
 935
 936         //not lets reverse the order of ever other process, so we balance big files running with little ones
 937         for (int i = 0; i < processors; i++) {
 938             //cout << i << endl;
 939             int remainder = ((i+1) % processors);
 940             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 941         }
 942
 943         if (m->control_pressed) { return listFiles; }
 944
 945         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 946                 int process = 1;
 947                 processIDS.clear();
 948
 949                 //loop through and create all the processes you want
 950                 while (process != processors) {
 951                         int pid = fork();
 952
 953                         if (pid > 0) {
 954                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 955                                 process++;
 956                         }else if (pid == 0){
 957                                 set<string> labels;
 958                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 959
 960                                 //write out names to file
 961                                 string filename = toString(getpid()) + ".temp";
 962                                 ofstream out;
 963                                 m->openOutputFile(filename, out);
 964                                 out << tag << endl;
 965                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 966                                 out.close();
 967
 968                                 //print out labels
 969                                 ofstream outLabels;
 970                                 filename = toString(getpid()) + ".temp.labels";
 971                                 m->openOutputFile(filename, outLabels);
 972
 973                                 outLabels << cutoff << endl;
 974                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 975                                         outLabels << (*it) << endl;
 976                                 }
 977                                 outLabels.close();
 978
 979                                 exit(0);
 980                         }else {
 981                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 982                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 983                                 exit(0);
 984                         }
 985                 }
 986
 987         //do your part
 988         listFiles = cluster(dividedNames[0], labels);
 989
 990                 //force parent to wait until all the processes are done
 991                 for (int i=0;i< processIDS.size();i++) {
 992                         int temp = processIDS[i];
 993                         wait(&temp);
 994                 }
 995
 996         //get list of list file names from each process
 997         for(int i=0;i<processIDS.size();i++){
 998             string filename = toString(processIDS[i]) + ".temp";
 999             ifstream in;
1000             m->openInputFile(filename, in);
1001
1002             in >> tag; m->gobble(in);
1003
1004             while(!in.eof()) {
1005                 string tempName;
1006                 in >> tempName; m->gobble(in);
1007                 listFiles.push_back(tempName);
1008             }
1009             in.close();
1010             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1011
1012             //get labels
1013             filename = toString(processIDS[i]) + ".temp.labels";
1014             ifstream in2;
1015             m->openInputFile(filename, in2);
1016
1017             float tempCutoff;
1018             in2 >> tempCutoff; m->gobble(in2);
1019             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1020
1021             while(!in2.eof()) {
1022                 string tempName;
1023                 in2 >> tempName; m->gobble(in2);
1024                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1025             }
1026             in2.close();
1027             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1028         }
1029
1030
1031     #else
1032
1033         //////////////////////////////////////////////////////////////////////////////////////////////////////
1034                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1035                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1036                 //Taking advantage of shared memory to allow both threads to add labels.
1037                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1038                 /*
1039                 vector<clusterData*> pDataArray;
1040                 DWORD   dwThreadIdArray[processors-1];
1041                 HANDLE  hThreadArray[processors-1];
1042
1043                 //Create processor worker threads.
1044                 for( int i=1; i<processors; i++ ){
1045                         // Allocate memory for thread data.
1046                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1047                         pDataArray.push_back(tempCluster);
1048                         processIDS.push_back(i);
1049
1050                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1051                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1052                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1053
1054                 }
1055
1056         //do your part
1057         listFiles = cluster(dividedNames[0], labels);
1058
1059                 //Wait until all threads have terminated.
1060                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1061
1062                 //Close all thread handles and free memory allocations.
1063                 for(int i=0; i < pDataArray.size(); i++){
1064             //get tag
1065             tag = pDataArray[i]->tag;
1066             //get listfiles created
1067             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1068             //get labels
1069             set<string>::iterator it;
1070             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1071                         //check cutoff
1072             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1073                         CloseHandle(hThreadArray[i]);
1074                         delete pDataArray[i];
1075                 }
1076 */
1077         #endif
1078
1079         return listFiles;
1080
1081         }
1082         catch(exception& e) {
1083                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1084                 exit(1);
1085         }
1086 }
1087 //**********************************************************************************************************************
1088
1089 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1090         try {
1091
1092                 vector<string> listFileNames;
1093                 double smallestCutoff = cutoff;
1094
1095                 //cluster each distance file
1096                 for (int i = 0; i < distNames.size(); i++) {
1097
1098                         string thisNamefile = distNames[i].begin()->second;
1099                         string thisDistFile = distNames[i].begin()->first;
1100
1101                         string listFileName = "";
1102             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1103             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1104
1105                         if (m->control_pressed) { //clean up
1106                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1107                                 listFileNames.clear(); return listFileNames;
1108                         }
1109
1110             listFileNames.push_back(listFileName);
1111         }
1112
1113                 cutoff = smallestCutoff;
1114
1115                 return listFileNames;
1116
1117         }
1118         catch(exception& e) {
1119                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1120                 exit(1);
1121         }
1122
1123
1124 }
1125 //**********************************************************************************************************************
1126 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1127         try {
1128         string listFileName = "";
1129
1130         ListVector* list = NULL;
1131         ListVector oldList;
1132         RAbundVector* rabund = NULL;
1133
1134 #ifdef USE_MPI
1135         int pid;
1136         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1137
1138         //output your files too
1139         if (pid != 0) {
1140             cout << endl << "Reading " << thisDistFile << endl;
1141         }
1142 #endif
1143
1144         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1145
1146         //reads phylip file storing data in 2D vector, also fills list and rabund
1147         bool sim = false;
1148                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1149
1150         NameAssignment* nameMap = NULL;
1151         CountTable* ct = NULL;
1152         if(namefile != ""){
1153                         nameMap = new NameAssignment(thisNamefile);
1154                         nameMap->readMap();
1155             cluster->readPhylipFile(thisDistFile, nameMap);
1156                 }else if (countfile != "") {
1157             ct = new CountTable();
1158             ct->readTable(thisNamefile);
1159             cluster->readPhylipFile(thisDistFile, ct);
1160         }
1161         tag = cluster->getTag();
1162
1163                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1164             else { delete ct; } delete cluster; return 0; }
1165
1166                 list = cluster->getListVector();
1167                 rabund = cluster->getRAbundVector();
1168
1169                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1170                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1171         listFileName = fileroot+ tag + ".list";
1172
1173         ofstream listFile;
1174                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1175
1176                 float previousDist = 0.00000;
1177                 float rndPreviousDist = 0.00000;
1178                 oldList = *list;
1179
1180 #ifdef USE_MPI
1181         //output your files too
1182         if (pid != 0) {
1183             cout << endl << "Clustering " << thisDistFile << endl;
1184         }
1185 #endif
1186
1187         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1188
1189                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1190                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1191                 else { delete ct; } return listFileName;  }
1192
1193                         cluster->update(cutoff);
1194
1195                         float dist = cluster->getSmallDist();
1196                         float rndDist;
1197                         if (hard) {
1198                                 rndDist = m->ceilDist(dist, precision);
1199                         }else{
1200                                 rndDist = m->roundDist(dist, precision);
1201                         }
1202
1203             if(previousDist <= 0.0000 && dist != previousDist){
1204                 oldList.setLabel("unique");
1205                 oldList.print(listFile);
1206                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1207             }
1208             else if(rndDist != rndPreviousDist){
1209                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1210                 oldList.print(listFile);
1211                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1212             }
1213
1214
1215                         previousDist = dist;
1216                         rndPreviousDist = rndDist;
1217                         oldList = *list;
1218                 }
1219
1220                 if(previousDist <= 0.0000){
1221             oldList.setLabel("unique");
1222             oldList.print(listFile);
1223             if (labels.count("unique") == 0) { labels.insert("unique"); }
1224         }
1225         else if(rndPreviousDist<cutoff){
1226             oldList.setLabel(toString(rndPreviousDist,  length-1));
1227             oldList.print(listFile);
1228             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1229         }
1230
1231
1232                 listFile.close();
1233
1234                 delete cluster;  delete list; delete rabund;
1235         if(namefile != ""){     delete nameMap; }
1236         else { delete ct; }
1237
1238         m->mothurRemove(thisDistFile);
1239         m->mothurRemove(thisNamefile);
1240
1241         return listFileName;
1242
1243         }
1244         catch(exception& e) {
1245                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1246                 exit(1);
1247         }
1248 }
1249
1250 //**********************************************************************************************************************
1251 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1252         try {
1253         string listFileName = "";
1254
1255         Cluster* cluster = NULL;
1256         SparseDistanceMatrix* matrix = NULL;
1257         ListVector* list = NULL;
1258         ListVector oldList;
1259         RAbundVector* rabund = NULL;
1260
1261         if (m->control_pressed) { return listFileName; }
1262
1263 #ifdef USE_MPI
1264         int pid;
1265         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1266
1267         //output your files too
1268         if (pid != 0) {
1269             cout << endl << "Reading " << thisDistFile << endl;
1270         }
1271 #endif
1272
1273         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1274
1275         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1276         read->setCutoff(cutoff);
1277
1278         NameAssignment* nameMap = NULL;
1279         CountTable* ct = NULL;
1280                 if(namefile != ""){
1281                         nameMap = new NameAssignment(thisNamefile);
1282                         nameMap->readMap();
1283             read->read(nameMap);
1284                 }else if (countfile != "") {
1285             ct = new CountTable();
1286             ct->readTable(thisNamefile);
1287             read->read(ct);
1288         }
1289
1290                 list = read->getListVector();
1291         oldList = *list;
1292                 matrix = read->getDMatrix();
1293
1294                 if(countfile != "") {
1295             rabund = new RAbundVector();
1296             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1297             delete ct;
1298         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1299
1300         delete read;  read = NULL;
1301         if (namefile != "") { delete nameMap; nameMap = NULL; }
1302
1303
1304 #ifdef USE_MPI
1305         //output your files too
1306         if (pid != 0) {
1307             cout << endl << "Clustering " << thisDistFile << endl;
1308         }
1309 #endif
1310
1311         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1312
1313         //create cluster
1314         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1315         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1316         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1317         tag = cluster->getTag();
1318
1319         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1320         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1321
1322         ofstream listFile;
1323         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1324
1325         listFileName = fileroot+ tag + ".list";
1326
1327         float previousDist = 0.00000;
1328         float rndPreviousDist = 0.00000;
1329
1330         oldList = *list;
1331
1332         print_start = true;
1333         start = time(NULL);
1334         double saveCutoff = cutoff;
1335
1336         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1337
1338             if (m->control_pressed) { //clean up
1339                 delete matrix; delete list;     delete cluster; delete rabund;
1340                 listFile.close();
1341                 m->mothurRemove(listFileName);
1342                 return listFileName;
1343             }
1344
1345             cluster->update(saveCutoff);
1346
1347             float dist = matrix->getSmallDist();
1348             float rndDist;
1349             if (hard) {
1350                 rndDist = m->ceilDist(dist, precision);
1351             }else{
1352                 rndDist = m->roundDist(dist, precision);
1353             }
1354
1355             if(previousDist <= 0.0000 && dist != previousDist){
1356                 oldList.setLabel("unique");
1357                 oldList.print(listFile);
1358                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1359             }
1360             else if(rndDist != rndPreviousDist){
1361                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1362                 oldList.print(listFile);
1363                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1364             }
1365
1366             previousDist = dist;
1367             rndPreviousDist = rndDist;
1368             oldList = *list;
1369         }
1370
1371
1372         if(previousDist <= 0.0000){
1373             oldList.setLabel("unique");
1374             oldList.print(listFile);
1375             if (labels.count("unique") == 0) { labels.insert("unique"); }
1376         }
1377         else if(rndPreviousDist<cutoff){
1378             oldList.setLabel(toString(rndPreviousDist,  length-1));
1379             oldList.print(listFile);
1380             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1381         }
1382
1383         delete matrix; delete list;     delete cluster; delete rabund;
1384         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1385         listFile.close();
1386
1387         if (m->control_pressed) { //clean up
1388             m->mothurRemove(listFileName);
1389             return listFileName;
1390         }
1391
1392         m->mothurRemove(thisDistFile);
1393         m->mothurRemove(thisNamefile);
1394
1395         if (saveCutoff != cutoff) {
1396             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1397             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1398
1399             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1400         }
1401
1402         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1403
1404         return listFileName;
1405
1406         }
1407         catch(exception& e) {
1408                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1409                 exit(1);
1410         }
1411 }
1412 //**********************************************************************************************************************
1413
1414 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1415         try{
1416
1417 #ifdef USE_MPI
1418                 int pid;
1419                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1420
1421                 if (pid != 0) {
1422 #endif
1423
1424                 string thisOutputDir = outputDir;
1425                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1426                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1427                 m->mothurRemove(outputFileName);
1428
1429
1430                 for (int i = 0; i < distNames.size(); i++) {
1431                         if (m->control_pressed) {  return 0; }
1432
1433                         string thisDistFile = distNames[i].begin()->first;
1434
1435                         m->appendFiles(thisDistFile, outputFileName);
1436                 }
1437
1438                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1439
1440 #ifdef USE_MPI
1441                 }
1442 #endif
1443
1444                 return 0;
1445
1446
1447         }
1448         catch(exception& e) {
1449                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1450                 exit(1);
1451         }
1452 }
1453 //**********************************************************************************************************************
1454 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1455     try {
1456         rabund->setLabel(list->getLabel());
1457         for(int i = 0; i < list->getNumBins(); i++) {
1458             if (m->control_pressed) { break; }
1459             vector<string> binNames;
1460             string bin = list->get(i);
1461             m->splitAtComma(bin, binNames);
1462             int total = 0;
1463             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1464             rabund->push_back(total);
1465         }
1466         return 0;
1467     }
1468     catch(exception& e) {
1469                 m->errorOut(e, "ClusterCommand", "createRabund");
1470                 exit(1);
1471         }
1472
1473 }
1474 //**********************************************************************************************************************