clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
  17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
  18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
  19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
  20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
  21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
  22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
  23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
  24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
  25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
  26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
  27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
  28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
  29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
  30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
  31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
  32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
  33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
  34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
  35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
  36
  37                 vector<string> myArray;
  38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  39                 return myArray;
  40         }
  41         catch(exception& e) {
  42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  43                 exit(1);
  44         }
  45 }
  46 //**********************************************************************************************************************
  47 string ClusterSplitCommand::getHelpString(){
  48         try {
  49                 string helpString = "";
  50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
  51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  59                 helpString += "The name parameter allows you to enter your name file. \n";
  60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
  62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
  65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  70 #ifdef USE_MPI
  71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  72 #endif
  73                 helpString += "The cluster.split command should be in the following format: \n";
  74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  76                 return helpString;
  77         }
  78         catch(exception& e) {
  79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  80                 exit(1);
  81         }
  82 }
  83 //**********************************************************************************************************************
  84 string ClusterSplitCommand::getOutputPattern(string type) {
  85     try {
  86         string pattern = "";
  87
  88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
  89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
  90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
  91         else if (type == "column") {  pattern = "[filename],dist"; }
  92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
  93
  94         return pattern;
  95     }
  96     catch(exception& e) {
  97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
  98         exit(1);
  99     }
 100 }
 101 //**********************************************************************************************************************
 102 ClusterSplitCommand::ClusterSplitCommand(){
 103         try {
 104                 abort = true; calledHelp = true;
 105                 setParameters();
 106                 vector<string> tempOutNames;
 107                 outputTypes["list"] = tempOutNames;
 108                 outputTypes["rabund"] = tempOutNames;
 109                 outputTypes["sabund"] = tempOutNames;
 110                 outputTypes["column"] = tempOutNames;
 111         }
 112         catch(exception& e) {
 113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 114                 exit(1);
 115         }
 116 }
 117 //**********************************************************************************************************************
 118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 120         try{
 121                 abort = false; calledHelp = false;
 122                 format = "";
 123
 124                 //allow user to run help
 125                 if(option == "help") { help(); abort = true; calledHelp = true; }
 126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 127
 128                 else {
 129                         vector<string> myArray = setParameters();
 130
 131                         OptionParser parser(option);
 132                         map<string,string> parameters = parser.getParameters();
 133
 134                         ValidParameters validParameter("cluster.split");
 135
 136                         //check to make sure all parameters are valid for command
 137                         map<string,string>::iterator it;
 138                         for (it = parameters.begin(); it != parameters.end(); it++) {
 139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 140                                         abort = true;
 141                                 }
 142                         }
 143
 144                         //initialize outputTypes
 145                         vector<string> tempOutNames;
 146                         outputTypes["list"] = tempOutNames;
 147                         outputTypes["rabund"] = tempOutNames;
 148                         outputTypes["sabund"] = tempOutNames;
 149                         outputTypes["column"] = tempOutNames;
 150
 151                         //if the user changes the output directory command factory will send this info to us in the output parameter
 152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 153
 154                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 156                         if (inputDir == "not found"){   inputDir = "";          }
 157                         else {
 158                                 string path;
 159                                 it = parameters.find("phylip");
 160                                 //user has given a template file
 161                                 if(it != parameters.end()){
 162                                         path = m->hasPath(it->second);
 163                                         //if the user has not given a path then, add inputdir. else leave path alone.
 164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 165                                 }
 166
 167                                 it = parameters.find("column");
 168                                 //user has given a template file
 169                                 if(it != parameters.end()){
 170                                         path = m->hasPath(it->second);
 171                                         //if the user has not given a path then, add inputdir. else leave path alone.
 172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 173                                 }
 174
 175                                 it = parameters.find("name");
 176                                 //user has given a template file
 177                                 if(it != parameters.end()){
 178                                         path = m->hasPath(it->second);
 179                                         //if the user has not given a path then, add inputdir. else leave path alone.
 180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 181                                 }
 182
 183                                 it = parameters.find("taxonomy");
 184                                 //user has given a template file
 185                                 if(it != parameters.end()){
 186                                         path = m->hasPath(it->second);
 187                                         //if the user has not given a path then, add inputdir. else leave path alone.
 188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 189                                 }
 190
 191                                 it = parameters.find("fasta");
 192                                 //user has given a template file
 193                                 if(it != parameters.end()){
 194                                         path = m->hasPath(it->second);
 195                                         //if the user has not given a path then, add inputdir. else leave path alone.
 196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 197                                 }
 198
 199                 it = parameters.find("count");
 200                                 //user has given a template file
 201                                 if(it != parameters.end()){
 202                                         path = m->hasPath(it->second);
 203                                         //if the user has not given a path then, add inputdir. else leave path alone.
 204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 205                                 }
 206                         }
 207
 208                         //check for required parameters
 209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 210                         if (phylipfile == "not open") { abort = true; }
 211                         else if (phylipfile == "not found") { phylipfile = ""; }
 212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 213
 214                         columnfile = validParameter.validFile(parameters, "column", true);
 215                         if (columnfile == "not open") { abort = true; }
 216                         else if (columnfile == "not found") { columnfile = ""; }
 217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 218
 219                         namefile = validParameter.validFile(parameters, "name", true);
 220                         if (namefile == "not open") { abort = true; namefile = "";}
 221                         else if (namefile == "not found") { namefile = "";  }
 222                         else { m->setNameFile(namefile); }
 223
 224             countfile = validParameter.validFile(parameters, "count", true);
 225                         if (countfile == "not open") { abort = true; countfile = "";}
 226                         else if (countfile == "not found") { countfile = "";  }
 227                         else { m->setCountTableFile(countfile); }
 228
 229                         fastafile = validParameter.validFile(parameters, "fasta", true);
 230                         if (fastafile == "not open") { abort = true; }
 231                         else if (fastafile == "not found") { fastafile = ""; }
 232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 233
 234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 235                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 236                         else if (taxFile == "not found") { taxFile = ""; }
 237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 238
 239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
 240                                 //is there are current file available for either of these?
 241                                 //give priority to column, then phylip, then fasta
 242                                 columnfile = m->getColumnFile();
 243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 244                                 else {
 245                                         phylipfile = m->getPhylipFile();
 246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 247                                         else {
 248                                                 fastafile = m->getFastaFile();
 249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 250                                                 else {
 251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
 252                                                         abort = true;
 253                                                 }
 254                                         }
 255                                 }
 256                         }
 257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 258
 259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 260
 261                         if (columnfile != "") {
 262                                 if ((namefile == "") && (countfile == "")) {
 263                                         namefile = m->getNameFile();
 264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 265                                         else {
 266                                                 countfile = m->getCountTableFile();
 267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 268                         else {
 269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 270                             abort = true;
 271                         }
 272                                         }
 273                                 }
 274                         }
 275
 276                         if (fastafile != "") {
 277                                 if (taxFile == "") {
 278                                         taxFile = m->getTaxonomyFile();
 279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 280                                         else {
 281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 282                                                 abort = true;
 283                                         }
 284                                 }
 285
 286                                 if ((namefile == "") && (countfile == "")) {
 287                                         namefile = m->getNameFile();
 288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 289                                         else {
 290                                                 countfile = m->getCountTableFile();
 291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 292                         else {
 293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 294                             abort = true;
 295                         }
 296                                         }
 297                                 }
 298                         }
 299
 300                         //check for optional parameter and set defaults
 301                         // ...at some point should added some additional type checking...
 302                         //get user cutoff and precision or use defaults
 303                         string temp;
 304                         temp = validParameter.validFile(parameters, "precision", false);
 305                         if (temp == "not found") { temp = "100"; }
 306                         //saves precision legnth for formatting below
 307                         length = temp.length();
 308                         m->mothurConvert(temp, precision);
 309
 310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 311                         hard = m->isTrue(temp);
 312
 313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 314                         large = m->isTrue(temp);
 315
 316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 317                         m->setProcessors(temp);
 318                         m->mothurConvert(temp, processors);
 319
 320                         temp = validParameter.validFile(parameters, "splitmethod", false);
 321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 322                                 if (temp == "not found")  { splitmethod = "distance"; }
 323                                 else {  splitmethod = temp; }
 324                         }
 325
 326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 327                         classic = m->isTrue(temp);
 328
 329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 330
 331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 332                         m->mothurConvert(temp, cutoff);
 333                         cutoff += (5 / (precision * 10.0));
 334
 335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 336                         m->mothurConvert(temp, taxLevelCutoff);
 337
 338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 339
 340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 342
 343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 345
 346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 347
 348                         showabund = validParameter.validFile(parameters, "showabund", false);
 349                         if (showabund == "not found") { showabund = "T"; }
 350
 351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
 352             runCluster = m->isTrue(temp);
 353
 354                         timing = validParameter.validFile(parameters, "timing", false);
 355                         if (timing == "not found") { timing = "F"; }
 356
 357                 }
 358         }
 359         catch(exception& e) {
 360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 361                 exit(1);
 362         }
 363 }
 364
 365 //**********************************************************************************************************************
 366
 367 int ClusterSplitCommand::execute(){
 368         try {
 369
 370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 371
 372                 time_t estart;
 373                 vector<string> listFileNames;
 374                 set<string> labels;
 375                 string singletonName = "";
 376                 double saveCutoff = cutoff;
 377
 378                 //****************** file prep work ******************************//
 379                 #ifdef USE_MPI
 380                         int pid;
 381                         int tag = 2001;
 382                         MPI_Status status;
 383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 385
 386                         if (pid == 0) { //only process 0 converts and splits
 387
 388                 #endif
 389
 390                 //if user gave a phylip file convert to column file
 391                 if (format == "phylip") {
 392                         estart = time(NULL);
 393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 394
 395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 396
 397                         NameAssignment* nameMap = NULL;
 398                         convert->setFormat("phylip");
 399                         convert->read(nameMap);
 400
 401                         if (m->control_pressed) {  delete convert;  return 0;  }
 402
 403                         distfile = convert->getOutputFile();
 404
 405                         //if no names file given with phylip file, create it
 406                         ListVector* listToMakeNameFile =  convert->getListVector();
 407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 408                                 ofstream out;
 409                                 namefile = phylipfile + ".names";
 410                                 m->openOutputFile(namefile, out);
 411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 412                                         string bin = listToMakeNameFile->get(i);
 413                                         out << bin << '\t' << bin << endl;
 414                                 }
 415                                 out.close();
 416                         }
 417                         delete listToMakeNameFile;
 418                         delete convert;
 419
 420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 421                 }
 422                 if (m->control_pressed) { return 0; }
 423
 424                 estart = time(NULL);
 425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 426
 427                 //split matrix into non-overlapping groups
 428                 SplitMatrix* split;
 429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 433
 434                 split->split();
 435
 436                 if (m->control_pressed) { delete split; return 0; }
 437
 438                 singletonName = split->getSingletonNames();
 439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 440                 delete split;
 441
 442         if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
 443
 444                 //output a merged distance file
 445                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
 446
 447
 448                 if (m->control_pressed) { return 0; }
 449
 450                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 451                 estart = time(NULL);
 452
 453         if (!runCluster) {
 454
 455                 m->mothurOutEndLine();
 456                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 457                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
 458                 m->mothurOutEndLine();
 459                 return 0;
 460
 461         }
 462
 463                 //****************** break up files between processes and cluster each file set ******************************//
 464         #ifdef USE_MPI
 465                         ////you are process 0 from above////
 466
 467                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 468                         dividedNames.resize(processors);
 469
 470                         //for each file group figure out which process will complete it
 471                         //want to divide the load intelligently so the big files are spread between processes
 472                         for (int i = 0; i < distName.size(); i++) {
 473                                 int processToAssign = (i+1) % processors;
 474                                 if (processToAssign == 0) { processToAssign = processors; }
 475
 476                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 477                         }
 478
 479                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 480                         for (int i = 0; i < processors; i++) {
 481                                 int remainder = ((i+1) % processors);
 482                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 483                         }
 484
 485
 486                         //send each child the list of files it needs to process
 487                         for(int i = 1; i < processors; i++) {
 488                                 //send number of file pairs
 489                                 int num = dividedNames[i].size();
 490                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 491
 492                                 for (int j = 0; j < num; j++) { //send filenames to process i
 493                                         char tempDistFileName[1024];
 494                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 495                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 496
 497                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 498                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 499
 500                                         char tempNameFileName[1024];
 501                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 502                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 503
 504                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 505                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 506                                 }
 507                         }
 508
 509                         //process your share
 510                         listFileNames = cluster(dividedNames[0], labels);
 511
 512                         //receive the other processes info
 513                         for(int i = 1; i < processors; i++) {
 514                                 int num = dividedNames[i].size();
 515
 516                                 double tempCutoff;
 517                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 518                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 519
 520                                 //send list filenames to root process
 521                                 for (int j = 0; j < num; j++) {
 522                                         int lengthList = 0;
 523                                         char tempListFileName[1024];
 524
 525                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 526                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 527
 528                                         string myListFileName = tempListFileName;
 529                                         myListFileName = myListFileName.substr(0, lengthList);
 530
 531                                         listFileNames.push_back(myListFileName);
 532                                 }
 533
 534                                 //send Labels to root process
 535                                 int numLabels = 0;
 536                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 537
 538                                 for (int j = 0; j < numLabels; j++) {
 539                                         int lengthLabel = 0;
 540                                         char tempLabel[100];
 541
 542                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 543                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 544
 545                                         string myLabel = tempLabel;
 546                                         myLabel = myLabel.substr(0, lengthLabel);
 547
 548                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 549                                 }
 550                         }
 551
 552                 }else { //you are a child process
 553                         vector < map<string, string> >  myNames;
 554
 555                         //recieve the files you need to process
 556                         //receive number of file pairs
 557                         int num = 0;
 558                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 559
 560                         myNames.resize(num);
 561
 562                         for (int j = 0; j < num; j++) { //receive filenames to process
 563                                 int lengthDist = 0;
 564                                 char tempDistFileName[1024];
 565
 566                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 567                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 568
 569                                 string myDistFileName = tempDistFileName;
 570                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 571
 572                                 int lengthName = 0;
 573                                 char tempNameFileName[1024];
 574
 575                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 576                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 577
 578                                 string myNameFileName = tempNameFileName;
 579                                 myNameFileName = myNameFileName.substr(0, lengthName);
 580
 581                                 //save file name
 582                                 myNames[j][myDistFileName] = myNameFileName;
 583                         }
 584
 585                         //process them
 586                         listFileNames = cluster(myNames, labels);
 587
 588                         //send cutoff
 589                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 590
 591                         //send list filenames to root process
 592                         for (int j = 0; j < num; j++) {
 593                                 char tempListFileName[1024];
 594                                 strcpy(tempListFileName, listFileNames[j].c_str());
 595                                 int lengthList = listFileNames[j].length();
 596
 597                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 598                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 599                         }
 600
 601                         //send Labels to root process
 602                         int numLabels = labels.size();
 603                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 604
 605                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 606                                 char tempLabel[100];
 607                                 strcpy(tempLabel, (*it).c_str());
 608                                 int lengthLabel = (*it).length();
 609
 610                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 611                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 612                         }
 613                 }
 614
 615                 //make everyone wait
 616                 MPI_Barrier(MPI_COMM_WORLD);
 617
 618         #else
 619                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 620                 //sanity check
 621                 if (processors > distName.size()) { processors = distName.size(); }
 622
 623                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 624                                 if(processors == 1){
 625                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 626                                 }else{
 627                                         listFileNames = createProcesses(distName, labels);
 628                 }
 629                 #else
 630                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 631                 #endif
 632         #endif
 633                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 634
 635                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 636
 637                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 638
 639                 //****************** merge list file and create rabund and sabund files ******************************//
 640                 estart = time(NULL);
 641                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 642
 643                 #ifdef USE_MPI
 644                         if (pid == 0) { //only process 0 merges
 645                 #endif
 646
 647                 ListVector* listSingle;
 648                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 649
 650                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 651
 652                 mergeLists(listFileNames, labelBins, listSingle);
 653
 654                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 655
 656                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 657
 658                 //set list file as new current listfile
 659                 string current = "";
 660                 itTypes = outputTypes.find("list");
 661                 if (itTypes != outputTypes.end()) {
 662                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 663                 }
 664
 665                 //set rabund file as new current rabundfile
 666                 itTypes = outputTypes.find("rabund");
 667                 if (itTypes != outputTypes.end()) {
 668                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 669                 }
 670
 671                 //set sabund file as new current sabundfile
 672                 itTypes = outputTypes.find("sabund");
 673                 if (itTypes != outputTypes.end()) {
 674                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 675                 }
 676
 677                 m->mothurOutEndLine();
 678                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 679                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 680                 m->mothurOutEndLine();
 681
 682                 #ifdef USE_MPI
 683                         } //only process 0 merges
 684
 685                         //make everyone wait
 686                         MPI_Barrier(MPI_COMM_WORLD);
 687                 #endif
 688
 689                 return 0;
 690         }
 691         catch(exception& e) {
 692                 m->errorOut(e, "ClusterSplitCommand", "execute");
 693                 exit(1);
 694         }
 695 }
 696 //**********************************************************************************************************************
 697 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 698         try {
 699
 700                 map<float, int> labelBin;
 701                 vector<float> orderFloat;
 702                 int numSingleBins;
 703
 704                 //read in singletons
 705                 if (singleton != "none") {
 706
 707             ifstream in;
 708             m->openInputFile(singleton, in);
 709
 710                         string firstCol, secondCol;
 711                         listSingle = new ListVector();
 712
 713             if (countfile != "") { m->getline(in); m->gobble(in); }
 714
 715                         while (!in.eof()) {
 716                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 717                                 if (countfile == "") { listSingle->push_back(secondCol); }
 718                 else { listSingle->push_back(firstCol); }
 719                         }
 720
 721                         in.close();
 722                         m->mothurRemove(singleton);
 723
 724                         numSingleBins = listSingle->getNumBins();
 725                 }else{  listSingle = NULL; numSingleBins = 0;  }
 726
 727                 //go through users set and make them floats so we can sort them
 728                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 729                         float temp = -10.0;
 730
 731                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 732                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 733
 734                         if (temp <= cutoff) {
 735                                 orderFloat.push_back(temp);
 736                                 labelBin[temp] = numSingleBins; //initialize numbins
 737                         }
 738                 }
 739
 740                 //sort order
 741                 sort(orderFloat.begin(), orderFloat.end());
 742                 userLabels.clear();
 743
 744                 //get the list info from each file
 745                 for (int k = 0; k < listNames.size(); k++) {
 746
 747                         if (m->control_pressed) {
 748                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 749                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 750                                 return labelBin;
 751                         }
 752
 753                         InputData* input = new InputData(listNames[k], "list");
 754                         ListVector* list = input->getListVector();
 755                         string lastLabel = list->getLabel();
 756
 757                         string filledInList = listNames[k] + "filledInTemp";
 758                         ofstream outFilled;
 759                         m->openOutputFile(filledInList, outFilled);
 760
 761                         //for each label needed
 762                         for(int l = 0; l < orderFloat.size(); l++){
 763
 764                                 string thisLabel;
 765                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 766                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 767
 768                                 //this file has reached the end
 769                                 if (list == NULL) {
 770                                         list = input->getListVector(lastLabel, true);
 771                                 }else{  //do you have the distance, or do you need to fill in
 772
 773                                         float labelFloat;
 774                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 775                                         else { convert(list->getLabel(), labelFloat); }
 776
 777                                         //check for missing labels
 778                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 779                                                 //if its bigger get last label, otherwise keep it
 780                                                 delete list;
 781                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 782                                         }
 783                                         lastLabel = list->getLabel();
 784                                 }
 785
 786                                 //print to new file
 787                                 list->setLabel(thisLabel);
 788                                 list->print(outFilled);
 789
 790                                 //update labelBin
 791                                 labelBin[orderFloat[l]] += list->getNumBins();
 792
 793                                 delete list;
 794
 795                                 list = input->getListVector();
 796                         }
 797
 798                         if (list != NULL) { delete list; }
 799                         delete input;
 800
 801                         outFilled.close();
 802                         m->mothurRemove(listNames[k]);
 803                         rename(filledInList.c_str(), listNames[k].c_str());
 804                 }
 805
 806                 return labelBin;
 807         }
 808         catch(exception& e) {
 809                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 810                 exit(1);
 811         }
 812 }
 813 //**********************************************************************************************************************
 814 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 815         try {
 816                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 817                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 818
 819         map<string, string> variables;
 820         variables["[filename]"] = fileroot;
 821         variables["[clustertag]"] = tag;
 822         string sabundFileName = getOutputFileName("sabund", variables);
 823         string rabundFileName = getOutputFileName("rabund", variables);
 824         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
 825         string listFileName = getOutputFileName("list", variables);
 826
 827         if (countfile == "") {
 828             m->openOutputFile(sabundFileName,   outSabund);
 829             m->openOutputFile(rabundFileName,   outRabund);
 830             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 831             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 832
 833         }
 834                 m->openOutputFile(listFileName, outList);
 835         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 836
 837
 838                 map<float, int>::iterator itLabel;
 839
 840                 //for each label needed
 841                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 842
 843                         string thisLabel;
 844                         if (itLabel->first == -1) { thisLabel = "unique"; }
 845                         else { thisLabel = toString(itLabel->first,  length-1);  }
 846
 847                         outList << thisLabel << '\t' << itLabel->second << '\t';
 848
 849             RAbundVector* rabund = NULL;
 850             if (countfile == "") {
 851                 rabund = new RAbundVector();
 852                 rabund->setLabel(thisLabel);
 853             }
 854
 855                         //add in singletons
 856                         if (listSingle != NULL) {
 857                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 858                                         outList << listSingle->get(j) << '\t';
 859                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 860                                 }
 861                         }
 862
 863                         //get the list info from each file
 864                         for (int k = 0; k < listNames.size(); k++) {
 865
 866                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 867
 868                                 InputData* input = new InputData(listNames[k], "list");
 869                                 ListVector* list = input->getListVector(thisLabel);
 870
 871                                 //this file has reached the end
 872                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 873                                 else {
 874                                         for (int j = 0; j < list->getNumBins(); j++) {
 875                                                 outList << list->get(j) << '\t';
 876                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 877                                         }
 878                                         delete list;
 879                                 }
 880                                 delete input;
 881                         }
 882
 883             if (countfile == "") {
 884                 SAbundVector sabund = rabund->getSAbundVector();
 885                 sabund.print(outSabund);
 886                 rabund->print(outRabund);
 887             }
 888                         outList << endl;
 889
 890                         if (rabund != NULL) { delete rabund; }
 891                 }
 892
 893                 outList.close();
 894         if (countfile == "") {
 895             outRabund.close();
 896             outSabund.close();
 897                 }
 898                 if (listSingle != NULL) { delete listSingle;  }
 899
 900                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 901
 902                 return 0;
 903         }
 904         catch(exception& e) {
 905                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 906                 exit(1);
 907         }
 908 }
 909
 910 //**********************************************************************************************************************
 911
 912 void ClusterSplitCommand::printData(ListVector* oldList){
 913         try {
 914                 string label = oldList->getLabel();
 915                 RAbundVector oldRAbund = oldList->getRAbundVector();
 916
 917                 oldRAbund.setLabel(label);
 918                 if (m->isTrue(showabund)) {
 919                         oldRAbund.getSAbundVector().print(cout);
 920                 }
 921                 oldRAbund.print(outRabund);
 922                 oldRAbund.getSAbundVector().print(outSabund);
 923
 924                 oldList->print(outList);
 925         }
 926         catch(exception& e) {
 927                 m->errorOut(e, "ClusterSplitCommand", "printData");
 928                 exit(1);
 929         }
 930 }
 931 //**********************************************************************************************************************
 932 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 933         try {
 934
 935         vector<string> listFiles;
 936         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 937         dividedNames.resize(processors);
 938
 939         //for each file group figure out which process will complete it
 940         //want to divide the load intelligently so the big files are spread between processes
 941         for (int i = 0; i < distName.size(); i++) {
 942             //cout << i << endl;
 943             int processToAssign = (i+1) % processors;
 944             if (processToAssign == 0) { processToAssign = processors; }
 945
 946             dividedNames[(processToAssign-1)].push_back(distName[i]);
 947             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 948         }
 949
 950         //now lets reverse the order of ever other process, so we balance big files running with little ones
 951         for (int i = 0; i < processors; i++) {
 952             //cout << i << endl;
 953             int remainder = ((i+1) % processors);
 954             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 955         }
 956
 957         if (m->control_pressed) { return listFiles; }
 958
 959         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 960                 int process = 1;
 961                 processIDS.clear();
 962
 963                 //loop through and create all the processes you want
 964                 while (process != processors) {
 965                         int pid = fork();
 966
 967                         if (pid > 0) {
 968                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 969                                 process++;
 970                         }else if (pid == 0){
 971                                 set<string> labels;
 972                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 973
 974                                 //write out names to file
 975                                 string filename = toString(getpid()) + ".temp";
 976                                 ofstream out;
 977                                 m->openOutputFile(filename, out);
 978                                 out << tag << endl;
 979                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 980                                 out.close();
 981
 982                                 //print out labels
 983                                 ofstream outLabels;
 984                                 filename = toString(getpid()) + ".temp.labels";
 985                                 m->openOutputFile(filename, outLabels);
 986
 987                                 outLabels << cutoff << endl;
 988                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 989                                         outLabels << (*it) << endl;
 990                                 }
 991                                 outLabels.close();
 992
 993                                 exit(0);
 994                         }else {
 995                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 996                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 997                                 exit(0);
 998                         }
 999                 }
1000
1001         //do your part
1002         listFiles = cluster(dividedNames[0], labels);
1003
1004                 //force parent to wait until all the processes are done
1005                 for (int i=0;i< processIDS.size();i++) {
1006                         int temp = processIDS[i];
1007                         wait(&temp);
1008                 }
1009
1010         //get list of list file names from each process
1011         for(int i=0;i<processIDS.size();i++){
1012             string filename = toString(processIDS[i]) + ".temp";
1013             ifstream in;
1014             m->openInputFile(filename, in);
1015
1016             in >> tag; m->gobble(in);
1017
1018             while(!in.eof()) {
1019                 string tempName;
1020                 in >> tempName; m->gobble(in);
1021                 listFiles.push_back(tempName);
1022             }
1023             in.close();
1024             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1025
1026             //get labels
1027             filename = toString(processIDS[i]) + ".temp.labels";
1028             ifstream in2;
1029             m->openInputFile(filename, in2);
1030
1031             float tempCutoff;
1032             in2 >> tempCutoff; m->gobble(in2);
1033             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1034
1035             while(!in2.eof()) {
1036                 string tempName;
1037                 in2 >> tempName; m->gobble(in2);
1038                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1039             }
1040             in2.close();
1041             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1042         }
1043
1044
1045     #else
1046
1047         //////////////////////////////////////////////////////////////////////////////////////////////////////
1048                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1049                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1050                 //Taking advantage of shared memory to allow both threads to add labels.
1051                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1052                 /*
1053                 vector<clusterData*> pDataArray;
1054                 DWORD   dwThreadIdArray[processors-1];
1055                 HANDLE  hThreadArray[processors-1];
1056
1057                 //Create processor worker threads.
1058                 for( int i=1; i<processors; i++ ){
1059                         // Allocate memory for thread data.
1060                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1061                         pDataArray.push_back(tempCluster);
1062                         processIDS.push_back(i);
1063
1064                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1065                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1066                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1067
1068                 }
1069
1070         //do your part
1071         listFiles = cluster(dividedNames[0], labels);
1072
1073                 //Wait until all threads have terminated.
1074                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1075
1076                 //Close all thread handles and free memory allocations.
1077                 for(int i=0; i < pDataArray.size(); i++){
1078             //get tag
1079             tag = pDataArray[i]->tag;
1080             //get listfiles created
1081             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1082             //get labels
1083             set<string>::iterator it;
1084             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1085                         //check cutoff
1086             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1087                         CloseHandle(hThreadArray[i]);
1088                         delete pDataArray[i];
1089                 }
1090 */
1091         #endif
1092
1093         return listFiles;
1094
1095         }
1096         catch(exception& e) {
1097                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1098                 exit(1);
1099         }
1100 }
1101 //**********************************************************************************************************************
1102
1103 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1104         try {
1105
1106                 vector<string> listFileNames;
1107                 double smallestCutoff = cutoff;
1108
1109                 //cluster each distance file
1110                 for (int i = 0; i < distNames.size(); i++) {
1111
1112                         string thisNamefile = distNames[i].begin()->second;
1113                         string thisDistFile = distNames[i].begin()->first;
1114
1115                         string listFileName = "";
1116             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1117             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1118
1119                         if (m->control_pressed) { //clean up
1120                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1121                                 listFileNames.clear(); return listFileNames;
1122                         }
1123
1124             listFileNames.push_back(listFileName);
1125         }
1126
1127                 cutoff = smallestCutoff;
1128
1129                 return listFileNames;
1130
1131         }
1132         catch(exception& e) {
1133                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1134                 exit(1);
1135         }
1136
1137
1138 }
1139 //**********************************************************************************************************************
1140 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1141         try {
1142         string listFileName = "";
1143
1144         ListVector* list = NULL;
1145         ListVector oldList;
1146         RAbundVector* rabund = NULL;
1147
1148 #ifdef USE_MPI
1149         int pid;
1150         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1151
1152         //output your files too
1153         if (pid != 0) {
1154             cout << endl << "Reading " << thisDistFile << endl;
1155         }
1156 #endif
1157
1158         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1159
1160         //reads phylip file storing data in 2D vector, also fills list and rabund
1161         bool sim = false;
1162                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1163
1164         NameAssignment* nameMap = NULL;
1165         CountTable* ct = NULL;
1166         if(namefile != ""){
1167                         nameMap = new NameAssignment(thisNamefile);
1168                         nameMap->readMap();
1169             cluster->readPhylipFile(thisDistFile, nameMap);
1170                 }else if (countfile != "") {
1171             ct = new CountTable();
1172             ct->readTable(thisNamefile);
1173             cluster->readPhylipFile(thisDistFile, ct);
1174         }
1175         tag = cluster->getTag();
1176
1177                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1178             else { delete ct; } delete cluster; return 0; }
1179
1180                 list = cluster->getListVector();
1181                 rabund = cluster->getRAbundVector();
1182
1183                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1184                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1185         listFileName = fileroot+ tag + ".list";
1186
1187         ofstream listFile;
1188                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1189
1190                 float previousDist = 0.00000;
1191                 float rndPreviousDist = 0.00000;
1192                 oldList = *list;
1193
1194 #ifdef USE_MPI
1195         //output your files too
1196         if (pid != 0) {
1197             cout << endl << "Clustering " << thisDistFile << endl;
1198         }
1199 #endif
1200
1201         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1202
1203                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1204                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1205                 else { delete ct; } return listFileName;  }
1206
1207                         cluster->update(cutoff);
1208
1209                         float dist = cluster->getSmallDist();
1210                         float rndDist;
1211                         if (hard) {
1212                                 rndDist = m->ceilDist(dist, precision);
1213                         }else{
1214                                 rndDist = m->roundDist(dist, precision);
1215                         }
1216
1217             if(previousDist <= 0.0000 && dist != previousDist){
1218                 oldList.setLabel("unique");
1219                 oldList.print(listFile);
1220                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1221             }
1222             else if(rndDist != rndPreviousDist){
1223                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1224                 oldList.print(listFile);
1225                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1226             }
1227
1228
1229                         previousDist = dist;
1230                         rndPreviousDist = rndDist;
1231                         oldList = *list;
1232                 }
1233
1234                 if(previousDist <= 0.0000){
1235             oldList.setLabel("unique");
1236             oldList.print(listFile);
1237             if (labels.count("unique") == 0) { labels.insert("unique"); }
1238         }
1239         else if(rndPreviousDist<cutoff){
1240             oldList.setLabel(toString(rndPreviousDist,  length-1));
1241             oldList.print(listFile);
1242             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1243         }
1244
1245
1246                 listFile.close();
1247
1248                 delete cluster;  delete list; delete rabund;
1249         if(namefile != ""){     delete nameMap; }
1250         else { delete ct; }
1251
1252         m->mothurRemove(thisDistFile);
1253         m->mothurRemove(thisNamefile);
1254
1255         return listFileName;
1256
1257         }
1258         catch(exception& e) {
1259                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1260                 exit(1);
1261         }
1262 }
1263
1264 //**********************************************************************************************************************
1265 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1266         try {
1267         string listFileName = "";
1268
1269         Cluster* cluster = NULL;
1270         SparseDistanceMatrix* matrix = NULL;
1271         ListVector* list = NULL;
1272         ListVector oldList;
1273         RAbundVector* rabund = NULL;
1274
1275         if (m->control_pressed) { return listFileName; }
1276
1277 #ifdef USE_MPI
1278         int pid;
1279         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1280
1281         //output your files too
1282         if (pid != 0) {
1283             cout << endl << "Reading " << thisDistFile << endl;
1284         }
1285 #endif
1286
1287         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1288
1289         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1290         read->setCutoff(cutoff);
1291
1292         NameAssignment* nameMap = NULL;
1293         CountTable* ct = NULL;
1294                 if(namefile != ""){
1295                         nameMap = new NameAssignment(thisNamefile);
1296                         nameMap->readMap();
1297             read->read(nameMap);
1298                 }else if (countfile != "") {
1299             ct = new CountTable();
1300             ct->readTable(thisNamefile);
1301             read->read(ct);
1302         }else { read->read(nameMap); }
1303
1304                 list = read->getListVector();
1305         oldList = *list;
1306                 matrix = read->getDMatrix();
1307
1308                 if(countfile != "") {
1309             rabund = new RAbundVector();
1310             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1311             delete ct;
1312         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1313
1314         delete read;  read = NULL;
1315         if (namefile != "") { delete nameMap; nameMap = NULL; }
1316
1317
1318 #ifdef USE_MPI
1319         //output your files too
1320         if (pid != 0) {
1321             cout << endl << "Clustering " << thisDistFile << endl;
1322         }
1323 #endif
1324
1325         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1326
1327         //create cluster
1328         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1329         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1330         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1331         tag = cluster->getTag();
1332
1333         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1334         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1335
1336         ofstream listFile;
1337         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1338
1339         listFileName = fileroot+ tag + ".list";
1340
1341         float previousDist = 0.00000;
1342         float rndPreviousDist = 0.00000;
1343
1344         oldList = *list;
1345
1346         print_start = true;
1347         start = time(NULL);
1348         double saveCutoff = cutoff;
1349
1350         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1351
1352             if (m->control_pressed) { //clean up
1353                 delete matrix; delete list;     delete cluster; delete rabund;
1354                 listFile.close();
1355                 m->mothurRemove(listFileName);
1356                 return listFileName;
1357             }
1358
1359             cluster->update(saveCutoff);
1360
1361             float dist = matrix->getSmallDist();
1362             float rndDist;
1363             if (hard) {
1364                 rndDist = m->ceilDist(dist, precision);
1365             }else{
1366                 rndDist = m->roundDist(dist, precision);
1367             }
1368
1369             if(previousDist <= 0.0000 && dist != previousDist){
1370                 oldList.setLabel("unique");
1371                 oldList.print(listFile);
1372                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1373             }
1374             else if(rndDist != rndPreviousDist){
1375                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1376                 oldList.print(listFile);
1377                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1378             }
1379
1380             previousDist = dist;
1381             rndPreviousDist = rndDist;
1382             oldList = *list;
1383         }
1384
1385
1386         if(previousDist <= 0.0000){
1387             oldList.setLabel("unique");
1388             oldList.print(listFile);
1389             if (labels.count("unique") == 0) { labels.insert("unique"); }
1390         }
1391         else if(rndPreviousDist<cutoff){
1392             oldList.setLabel(toString(rndPreviousDist,  length-1));
1393             oldList.print(listFile);
1394             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1395         }
1396
1397         delete matrix; delete list;     delete cluster; delete rabund;
1398         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1399         listFile.close();
1400
1401         if (m->control_pressed) { //clean up
1402             m->mothurRemove(listFileName);
1403             return listFileName;
1404         }
1405
1406         m->mothurRemove(thisDistFile);
1407         m->mothurRemove(thisNamefile);
1408
1409         if (saveCutoff != cutoff) {
1410             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1411             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1412
1413             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1414         }
1415
1416         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1417
1418         return listFileName;
1419
1420         }
1421         catch(exception& e) {
1422                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1423                 exit(1);
1424         }
1425 }
1426 //**********************************************************************************************************************
1427
1428 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1429         try{
1430
1431 #ifdef USE_MPI
1432                 int pid;
1433                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1434
1435                 if (pid != 0) {
1436 #endif
1437
1438                 string thisOutputDir = outputDir;
1439                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1440         map<string, string> variables;
1441         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1442                 string outputFileName = getOutputFileName("column", variables);
1443                 m->mothurRemove(outputFileName);
1444
1445
1446                 for (int i = 0; i < distNames.size(); i++) {
1447                         if (m->control_pressed) {  return 0; }
1448
1449                         string thisDistFile = distNames[i].begin()->first;
1450
1451                         m->appendFiles(thisDistFile, outputFileName);
1452                 }
1453
1454                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1455
1456 #ifdef USE_MPI
1457                 }
1458 #endif
1459
1460                 return 0;
1461
1462
1463         }
1464         catch(exception& e) {
1465                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1466                 exit(1);
1467         }
1468 }
1469 //**********************************************************************************************************************
1470 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1471     try {
1472         rabund->setLabel(list->getLabel());
1473         for(int i = 0; i < list->getNumBins(); i++) {
1474             if (m->control_pressed) { break; }
1475             vector<string> binNames;
1476             string bin = list->get(i);
1477             m->splitAtComma(bin, binNames);
1478             int total = 0;
1479             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1480             rabund->push_back(total);
1481         }
1482         return 0;
1483     }
1484     catch(exception& e) {
1485                 m->errorOut(e, "ClusterCommand", "createRabund");
1486                 exit(1);
1487         }
1488
1489 }
1490 //**********************************************************************************************************************