clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11
  12
  13 //**********************************************************************************************************************
  14 vector<string> ClusterSplitCommand::setParameters(){
  15         try {
  16         CommandParameter pfile("file", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","",false,false,true); parameters.push_back(pfile);
  17                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
  18                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
  19                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
  20                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
  21         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
  22                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
  23                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
  24                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
  25                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
  26                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
  27         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
  28                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
  29                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
  30                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
  31                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
  32                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
  33                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
  34         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
  35                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
  36                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
  37
  38                 vector<string> myArray;
  39                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  40                 return myArray;
  41         }
  42         catch(exception& e) {
  43                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
  44                 exit(1);
  45         }
  46 }
  47 //**********************************************************************************************************************
  48 string ClusterSplitCommand::getHelpString(){
  49         try {
  50                 string helpString = "";
  51                 helpString += "The cluster.split command parameter options are file, fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
  52                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
  53                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
  54                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
  55                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
  56                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
  57                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
  58         helpString += "The file option allows you to enter your file containing your list of column and names/count files as well as the singleton file.  This file is mothur generated, when you run cluster.split() with the cluster=f parameter.  This can be helpful when you have a large dataset that you may be able to use all your processors for the splitting step, but have to reduce them for the cluster step due to RAM constraints. For example: cluster.split(fasta=yourFasta, taxonomy=yourTax, count=yourCount, taxlevel=3, cluster=f, processors=8) then cluster.split(file=yourFile, processors=4).  This allows your to maximize your processors during the splitting step.  Also, if you are unsure if the cluster step will have RAM issue with multiple processors, you can avoid running the first part of the command multiple times.\n";
  59                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
  60                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
  61                 helpString += "The name parameter allows you to enter your name file. \n";
  62         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
  63         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
  64                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
  65                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
  66                 helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
  67                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
  68                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
  69                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
  70                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  71         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
  72 #ifdef USE_MPI
  73                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
  74 #endif
  75                 helpString += "The cluster.split command should be in the following format: \n";
  76                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
  77                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
  78                 return helpString;
  79         }
  80         catch(exception& e) {
  81                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
  82                 exit(1);
  83         }
  84 }
  85 //**********************************************************************************************************************
  86 string ClusterSplitCommand::getOutputPattern(string type) {
  87     try {
  88         string pattern = "";
  89
  90         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
  91         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
  92         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
  93         else if (type == "column") {  pattern = "[filename],dist"; }
  94         else if (type == "file")   {  pattern = "[filename],file"; }
  95         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
  96
  97         return pattern;
  98     }
  99     catch(exception& e) {
 100         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
 101         exit(1);
 102     }
 103 }
 104 //**********************************************************************************************************************
 105 ClusterSplitCommand::ClusterSplitCommand(){
 106         try {
 107                 abort = true; calledHelp = true;
 108                 setParameters();
 109                 vector<string> tempOutNames;
 110                 outputTypes["list"] = tempOutNames;
 111                 outputTypes["rabund"] = tempOutNames;
 112                 outputTypes["sabund"] = tempOutNames;
 113                 outputTypes["column"] = tempOutNames;
 114         outputTypes["file"] = tempOutNames;
 115         }
 116         catch(exception& e) {
 117                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 118                 exit(1);
 119         }
 120 }
 121 //**********************************************************************************************************************
 122 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
 123 ClusterSplitCommand::ClusterSplitCommand(string option)  {
 124         try{
 125                 abort = false; calledHelp = false;
 126                 format = "";
 127
 128                 //allow user to run help
 129                 if(option == "help") { help(); abort = true; calledHelp = true; }
 130                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 131
 132                 else {
 133                         vector<string> myArray = setParameters();
 134
 135                         OptionParser parser(option);
 136                         map<string,string> parameters = parser.getParameters();
 137
 138                         ValidParameters validParameter("cluster.split");
 139
 140                         //check to make sure all parameters are valid for command
 141                         map<string,string>::iterator it;
 142                         for (it = parameters.begin(); it != parameters.end(); it++) {
 143                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
 144                                         abort = true;
 145                                 }
 146                         }
 147
 148                         //initialize outputTypes
 149                         vector<string> tempOutNames;
 150                         outputTypes["list"] = tempOutNames;
 151                         outputTypes["rabund"] = tempOutNames;
 152                         outputTypes["sabund"] = tempOutNames;
 153                         outputTypes["column"] = tempOutNames;
 154             outputTypes["file"] = tempOutNames;
 155
 156                         //if the user changes the output directory command factory will send this info to us in the output parameter
 157                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
 158
 159                                 //if the user changes the input directory command factory will send this info to us in the output parameter
 160                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 161                         if (inputDir == "not found"){   inputDir = "";          }
 162                         else {
 163                                 string path;
 164                                 it = parameters.find("phylip");
 165                                 //user has given a template file
 166                                 if(it != parameters.end()){
 167                                         path = m->hasPath(it->second);
 168                                         //if the user has not given a path then, add inputdir. else leave path alone.
 169                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
 170                                 }
 171
 172                                 it = parameters.find("column");
 173                                 //user has given a template file
 174                                 if(it != parameters.end()){
 175                                         path = m->hasPath(it->second);
 176                                         //if the user has not given a path then, add inputdir. else leave path alone.
 177                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
 178                                 }
 179
 180                                 it = parameters.find("name");
 181                                 //user has given a template file
 182                                 if(it != parameters.end()){
 183                                         path = m->hasPath(it->second);
 184                                         //if the user has not given a path then, add inputdir. else leave path alone.
 185                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 186                                 }
 187
 188                                 it = parameters.find("taxonomy");
 189                                 //user has given a template file
 190                                 if(it != parameters.end()){
 191                                         path = m->hasPath(it->second);
 192                                         //if the user has not given a path then, add inputdir. else leave path alone.
 193                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
 194                                 }
 195
 196                                 it = parameters.find("fasta");
 197                                 //user has given a template file
 198                                 if(it != parameters.end()){
 199                                         path = m->hasPath(it->second);
 200                                         //if the user has not given a path then, add inputdir. else leave path alone.
 201                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 202                                 }
 203
 204                 it = parameters.find("count");
 205                                 //user has given a template file
 206                                 if(it != parameters.end()){
 207                                         path = m->hasPath(it->second);
 208                                         //if the user has not given a path then, add inputdir. else leave path alone.
 209                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 210                                 }
 211
 212                 it = parameters.find("file");
 213                                 //user has given a template file
 214                                 if(it != parameters.end()){
 215                                         path = m->hasPath(it->second);
 216                                         //if the user has not given a path then, add inputdir. else leave path alone.
 217                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
 218                                 }
 219                         }
 220
 221                         //check for required parameters
 222                         file = validParameter.validFile(parameters, "file", true);
 223                         if (file == "not open") { file = ""; abort = true; }
 224                         else if (file == "not found") { file = ""; }
 225             else { distfile = file; }
 226
 227             phylipfile = validParameter.validFile(parameters, "phylip", true);
 228                         if (phylipfile == "not open") { abort = true; }
 229                         else if (phylipfile == "not found") { phylipfile = ""; }
 230                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
 231
 232                         columnfile = validParameter.validFile(parameters, "column", true);
 233                         if (columnfile == "not open") { abort = true; }
 234                         else if (columnfile == "not found") { columnfile = ""; }
 235                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
 236
 237                         namefile = validParameter.validFile(parameters, "name", true);
 238                         if (namefile == "not open") { abort = true; namefile = "";}
 239                         else if (namefile == "not found") { namefile = "";  }
 240                         else { m->setNameFile(namefile); }
 241
 242             countfile = validParameter.validFile(parameters, "count", true);
 243                         if (countfile == "not open") { abort = true; countfile = "";}
 244                         else if (countfile == "not found") { countfile = "";  }
 245                         else { m->setCountTableFile(countfile); }
 246
 247                         fastafile = validParameter.validFile(parameters, "fasta", true);
 248                         if (fastafile == "not open") { abort = true; }
 249                         else if (fastafile == "not found") { fastafile = ""; }
 250                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 251
 252                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 253                         if (taxFile == "not open") { taxFile = ""; abort = true; }
 254                         else if (taxFile == "not found") { taxFile = ""; }
 255                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 256
 257                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "") && (file == "")) {
 258                                 //is there are current file available for either of these?
 259                                 //give priority to column, then phylip, then fasta
 260                                 columnfile = m->getColumnFile();
 261                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
 262                                 else {
 263                                         phylipfile = m->getPhylipFile();
 264                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
 265                                         else {
 266                                                 fastafile = m->getFastaFile();
 267                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 268                                                 else {
 269                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a file, phylip or a column or fastafile."); m->mothurOutEndLine();
 270                                                         abort = true;
 271                                                 }
 272                                         }
 273                                 }
 274                         }
 275                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "") && (file != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: file, fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 276
 277             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
 278
 279                         if (columnfile != "") {
 280                                 if ((namefile == "") && (countfile == "")) {
 281                                         namefile = m->getNameFile();
 282                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 283                                         else {
 284                                                 countfile = m->getCountTableFile();
 285                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 286                         else {
 287                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
 288                             abort = true;
 289                         }
 290                                         }
 291                                 }
 292                         }
 293
 294                         if (fastafile != "") {
 295                                 if (taxFile == "") {
 296                                         taxFile = m->getTaxonomyFile();
 297                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
 298                                         else {
 299                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
 300                                                 abort = true;
 301                                         }
 302                                 }
 303
 304                                 if ((namefile == "") && (countfile == "")) {
 305                                         namefile = m->getNameFile();
 306                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
 307                                         else {
 308                                                 countfile = m->getCountTableFile();
 309                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
 310                         else {
 311                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
 312                             abort = true;
 313                         }
 314                                         }
 315                                 }
 316                         }
 317
 318                         //check for optional parameter and set defaults
 319                         // ...at some point should added some additional type checking...
 320                         //get user cutoff and precision or use defaults
 321                         string temp;
 322                         temp = validParameter.validFile(parameters, "precision", false);
 323                         if (temp == "not found") { temp = "100"; }
 324                         //saves precision legnth for formatting below
 325                         length = temp.length();
 326                         m->mothurConvert(temp, precision);
 327
 328                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
 329                         hard = m->isTrue(temp);
 330
 331                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 332                         large = m->isTrue(temp);
 333
 334                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 335                         m->setProcessors(temp);
 336                         m->mothurConvert(temp, processors);
 337
 338                         temp = validParameter.validFile(parameters, "splitmethod", false);
 339                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 340                                 if (temp == "not found")  { splitmethod = "distance"; }
 341                                 else {  splitmethod = temp; }
 342                         }
 343
 344             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
 345                         classic = m->isTrue(temp);
 346
 347             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
 348
 349                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
 350                         m->mothurConvert(temp, cutoff);
 351                         cutoff += (5 / (precision * 10.0));
 352
 353                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
 354                         m->mothurConvert(temp, taxLevelCutoff);
 355
 356                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
 357
 358                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 359                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 360
 361                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 362                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 363
 364                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 365
 366                         showabund = validParameter.validFile(parameters, "showabund", false);
 367                         if (showabund == "not found") { showabund = "T"; }
 368
 369             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
 370             runCluster = m->isTrue(temp);
 371
 372                         timing = validParameter.validFile(parameters, "timing", false);
 373                         if (timing == "not found") { timing = "F"; }
 374
 375                 }
 376         }
 377         catch(exception& e) {
 378                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 379                 exit(1);
 380         }
 381 }
 382
 383 //**********************************************************************************************************************
 384
 385 int ClusterSplitCommand::execute(){
 386         try {
 387
 388                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 389
 390         time_t estart;
 391         vector<string> listFileNames;
 392         vector< map<string, string> > distName;
 393         set<string> labels;
 394         string singletonName = "";
 395
 396         double saveCutoff = cutoff;
 397
 398         if (file != "") {  deleteFiles = false; estart = time(NULL); singletonName = readFile(distName); }
 399         else {
 400
 401             //****************** file prep work ******************************//
 402 #ifdef USE_MPI
 403                         int pid;
 404                         int tag = 2001;
 405                         MPI_Status status;
 406                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
 407                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
 408
 409                         if (pid == 0) { //only process 0 converts and splits
 410
 411 #endif
 412
 413                 //if user gave a phylip file convert to column file
 414                 if (format == "phylip") {
 415                     estart = time(NULL);
 416                     m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 417
 418                     ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 419
 420                     NameAssignment* nameMap = NULL;
 421                     convert->setFormat("phylip");
 422                     convert->read(nameMap);
 423
 424                     if (m->control_pressed) {  delete convert;  return 0;  }
 425
 426                     distfile = convert->getOutputFile();
 427
 428                     //if no names file given with phylip file, create it
 429                     ListVector* listToMakeNameFile =  convert->getListVector();
 430                     if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
 431                         ofstream out;
 432                         namefile = phylipfile + ".names";
 433                         m->openOutputFile(namefile, out);
 434                         for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 435                             string bin = listToMakeNameFile->get(i);
 436                             out << bin << '\t' << bin << endl;
 437                         }
 438                         out.close();
 439                     }
 440                     delete listToMakeNameFile;
 441                     delete convert;
 442
 443                     m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 444                 }
 445                 if (m->control_pressed) { return 0; }
 446
 447                 estart = time(NULL);
 448                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 449
 450                 //split matrix into non-overlapping groups
 451                 SplitMatrix* split;
 452                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
 453                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
 454                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
 455                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 456
 457                 split->split();
 458
 459                 if (m->control_pressed) { delete split; return 0; }
 460
 461                 singletonName = split->getSingletonNames();
 462                 distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 463                 delete split;
 464
 465                 if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
 466
 467                 //output a merged distance file
 468                 //if (splitmethod == "fasta")           { createMergedDistanceFile(distName); }
 469
 470                 if (m->control_pressed) { return 0; }
 471
 472                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 473                 estart = time(NULL);
 474
 475                 if (!runCluster) {
 476                     printFile(singletonName, distName);
 477
 478                     m->mothurOutEndLine();
 479                     m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 480                     for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
 481                     m->mothurOutEndLine();
 482                     return 0;
 483
 484                 }
 485                 deleteFiles = true;
 486             }
 487                 //****************** break up files between processes and cluster each file set ******************************//
 488         #ifdef USE_MPI
 489                         ////you are process 0 from above////
 490
 491                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 492                         dividedNames.resize(processors);
 493
 494                         //for each file group figure out which process will complete it
 495                         //want to divide the load intelligently so the big files are spread between processes
 496                         for (int i = 0; i < distName.size(); i++) {
 497                                 int processToAssign = (i+1) % processors;
 498                                 if (processToAssign == 0) { processToAssign = processors; }
 499
 500                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 501                         }
 502
 503                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 504                         for (int i = 0; i < processors; i++) {
 505                                 int remainder = ((i+1) % processors);
 506                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 507                         }
 508
 509
 510                         //send each child the list of files it needs to process
 511                         for(int i = 1; i < processors; i++) {
 512                                 //send number of file pairs
 513                                 int num = dividedNames[i].size();
 514                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 515
 516                                 for (int j = 0; j < num; j++) { //send filenames to process i
 517                                         char tempDistFileName[1024];
 518                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
 519                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
 520
 521                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 522                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 523
 524                                         char tempNameFileName[1024];
 525                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
 526                                         int lengthName = (dividedNames[i][j].begin()->second).length();
 527
 528                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
 529                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
 530                                 }
 531                         }
 532
 533                         //process your share
 534                         listFileNames = cluster(dividedNames[0], labels);
 535
 536                         //receive the other processes info
 537                         for(int i = 1; i < processors; i++) {
 538                                 int num = dividedNames[i].size();
 539
 540                                 double tempCutoff;
 541                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
 542                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
 543
 544                                 //send list filenames to root process
 545                                 for (int j = 0; j < num; j++) {
 546                                         int lengthList = 0;
 547                                         char tempListFileName[1024];
 548
 549                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 550                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 551
 552                                         string myListFileName = tempListFileName;
 553                                         myListFileName = myListFileName.substr(0, lengthList);
 554
 555                                         listFileNames.push_back(myListFileName);
 556                                 }
 557
 558                                 //send Labels to root process
 559                                 int numLabels = 0;
 560                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 561
 562                                 for (int j = 0; j < numLabels; j++) {
 563                                         int lengthLabel = 0;
 564                                         char tempLabel[100];
 565
 566                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
 567                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
 568
 569                                         string myLabel = tempLabel;
 570                                         myLabel = myLabel.substr(0, lengthLabel);
 571
 572                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
 573                                 }
 574                         }
 575
 576                 }else { //you are a child process
 577                         vector < map<string, string> >  myNames;
 578
 579                         //recieve the files you need to process
 580                         //receive number of file pairs
 581                         int num = 0;
 582                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 583
 584                         myNames.resize(num);
 585
 586                         for (int j = 0; j < num; j++) { //receive filenames to process
 587                                 int lengthDist = 0;
 588                                 char tempDistFileName[1024];
 589
 590                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 591                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 592
 593                                 string myDistFileName = tempDistFileName;
 594                                 myDistFileName = myDistFileName.substr(0, lengthDist);
 595
 596                                 int lengthName = 0;
 597                                 char tempNameFileName[1024];
 598
 599                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
 600                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
 601
 602                                 string myNameFileName = tempNameFileName;
 603                                 myNameFileName = myNameFileName.substr(0, lengthName);
 604
 605                                 //save file name
 606                                 myNames[j][myDistFileName] = myNameFileName;
 607                         }
 608
 609                         //process them
 610                         listFileNames = cluster(myNames, labels);
 611
 612                         //send cutoff
 613                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
 614
 615                         //send list filenames to root process
 616                         for (int j = 0; j < num; j++) {
 617                                 char tempListFileName[1024];
 618                                 strcpy(tempListFileName, listFileNames[j].c_str());
 619                                 int lengthList = listFileNames[j].length();
 620
 621                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 622                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 623                         }
 624
 625                         //send Labels to root process
 626                         int numLabels = labels.size();
 627                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 628
 629                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
 630                                 char tempLabel[100];
 631                                 strcpy(tempLabel, (*it).c_str());
 632                                 int lengthLabel = (*it).length();
 633
 634                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
 635                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
 636                         }
 637                 }
 638
 639                 //make everyone wait
 640                 MPI_Barrier(MPI_COMM_WORLD);
 641
 642         #else
 643                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
 644                 //sanity check
 645                 if (processors > distName.size()) { processors = distName.size(); }
 646
 647                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 648                                 if(processors == 1){
 649                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 650                                 }else{
 651                                         listFileNames = createProcesses(distName, labels);
 652                 }
 653                 #else
 654                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 655                 #endif
 656         #endif
 657                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 658
 659                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 660
 661                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 662
 663                 //****************** merge list file and create rabund and sabund files ******************************//
 664                 estart = time(NULL);
 665                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 666
 667                 #ifdef USE_MPI
 668                         if (pid == 0) { //only process 0 merges
 669                 #endif
 670
 671                 ListVector* listSingle;
 672                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 673
 674                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 675
 676                 mergeLists(listFileNames, labelBins, listSingle);
 677
 678                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 679
 680         //delete after all are complete incase a crash happens
 681         if (!deleteFiles) { for (int i = 0; i < distName.size(); i++) { m->mothurRemove(distName[i].begin()->first); m->mothurRemove(distName[i].begin()->second);      } }
 682
 683                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 684
 685                 //set list file as new current listfile
 686                 string current = "";
 687                 itTypes = outputTypes.find("list");
 688                 if (itTypes != outputTypes.end()) {
 689                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
 690                 }
 691
 692                 //set rabund file as new current rabundfile
 693                 itTypes = outputTypes.find("rabund");
 694                 if (itTypes != outputTypes.end()) {
 695                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
 696                 }
 697
 698                 //set sabund file as new current sabundfile
 699                 itTypes = outputTypes.find("sabund");
 700                 if (itTypes != outputTypes.end()) {
 701                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
 702                 }
 703
 704                 m->mothurOutEndLine();
 705                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 706                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 707                 m->mothurOutEndLine();
 708
 709                 #ifdef USE_MPI
 710                         } //only process 0 merges
 711
 712                         //make everyone wait
 713                         MPI_Barrier(MPI_COMM_WORLD);
 714                 #endif
 715
 716                 return 0;
 717         }
 718         catch(exception& e) {
 719                 m->errorOut(e, "ClusterSplitCommand", "execute");
 720                 exit(1);
 721         }
 722 }
 723 //**********************************************************************************************************************
 724 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 725         try {
 726
 727                 map<float, int> labelBin;
 728                 vector<float> orderFloat;
 729                 int numSingleBins;
 730
 731                 //read in singletons
 732                 if (singleton != "none") {
 733
 734             ifstream in;
 735             m->openInputFile(singleton, in);
 736
 737                         string firstCol, secondCol;
 738                         listSingle = new ListVector();
 739
 740             if (countfile != "") { m->getline(in); m->gobble(in); }
 741
 742                         while (!in.eof()) {
 743                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
 744                                 if (countfile == "") { listSingle->push_back(secondCol); }
 745                 else { listSingle->push_back(firstCol); }
 746                         }
 747
 748                         in.close();
 749                         m->mothurRemove(singleton);
 750
 751                         numSingleBins = listSingle->getNumBins();
 752                 }else{  listSingle = NULL; numSingleBins = 0;  }
 753
 754                 //go through users set and make them floats so we can sort them
 755                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 756                         float temp = -10.0;
 757
 758                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 759                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 760
 761                         if (temp <= cutoff) {
 762                                 orderFloat.push_back(temp);
 763                                 labelBin[temp] = numSingleBins; //initialize numbins
 764                         }
 765                 }
 766
 767                 //sort order
 768                 sort(orderFloat.begin(), orderFloat.end());
 769                 userLabels.clear();
 770
 771                 //get the list info from each file
 772                 for (int k = 0; k < listNames.size(); k++) {
 773
 774                         if (m->control_pressed) {
 775                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
 776                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 777                                 return labelBin;
 778                         }
 779
 780                         InputData* input = new InputData(listNames[k], "list");
 781                         ListVector* list = input->getListVector();
 782                         string lastLabel = list->getLabel();
 783
 784                         string filledInList = listNames[k] + "filledInTemp";
 785                         ofstream outFilled;
 786                         m->openOutputFile(filledInList, outFilled);
 787
 788                         //for each label needed
 789                         for(int l = 0; l < orderFloat.size(); l++){
 790
 791                                 string thisLabel;
 792                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 793                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 794
 795                                 //this file has reached the end
 796                                 if (list == NULL) {
 797                                         list = input->getListVector(lastLabel, true);
 798                                 }else{  //do you have the distance, or do you need to fill in
 799
 800                                         float labelFloat;
 801                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 802                                         else { convert(list->getLabel(), labelFloat); }
 803
 804                                         //check for missing labels
 805                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 806                                                 //if its bigger get last label, otherwise keep it
 807                                                 delete list;
 808                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 809                                         }
 810                                         lastLabel = list->getLabel();
 811                                 }
 812
 813                                 //print to new file
 814                                 list->setLabel(thisLabel);
 815                                 list->print(outFilled);
 816
 817                                 //update labelBin
 818                                 labelBin[orderFloat[l]] += list->getNumBins();
 819
 820                                 delete list;
 821
 822                                 list = input->getListVector();
 823                         }
 824
 825                         if (list != NULL) { delete list; }
 826                         delete input;
 827
 828                         outFilled.close();
 829                         m->mothurRemove(listNames[k]);
 830                         rename(filledInList.c_str(), listNames[k].c_str());
 831                 }
 832
 833                 return labelBin;
 834         }
 835         catch(exception& e) {
 836                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 837                 exit(1);
 838         }
 839 }
 840 //**********************************************************************************************************************
 841 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 842         try {
 843                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
 844                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 845
 846         map<string, string> variables;
 847         variables["[filename]"] = fileroot;
 848         variables["[clustertag]"] = tag;
 849         string sabundFileName = getOutputFileName("sabund", variables);
 850         string rabundFileName = getOutputFileName("rabund", variables);
 851         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
 852         string listFileName = getOutputFileName("list", variables);
 853
 854         if (countfile == "") {
 855             m->openOutputFile(sabundFileName,   outSabund);
 856             m->openOutputFile(rabundFileName,   outRabund);
 857             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
 858             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
 859
 860         }
 861                 m->openOutputFile(listFileName, outList);
 862         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
 863
 864                 map<float, int>::iterator itLabel;
 865
 866         //clears out junk for autocompleting of list files above.  Perhaps there is a beter way to handle this from within the data structure?
 867         m->printedListHeaders = false;
 868
 869                 //for each label needed
 870                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 871
 872                         string thisLabel;
 873                         if (itLabel->first == -1) { thisLabel = "unique"; }
 874                         else { thisLabel = toString(itLabel->first,  length-1);  }
 875
 876                         //outList << thisLabel << '\t' << itLabel->second << '\t';
 877
 878             RAbundVector* rabund = NULL;
 879             ListVector completeList;
 880             completeList.setLabel(thisLabel);
 881
 882             if (countfile == "") {
 883                 rabund = new RAbundVector();
 884                 rabund->setLabel(thisLabel);
 885             }
 886
 887                         //add in singletons
 888                         if (listSingle != NULL) {
 889                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 890                                         //outList << listSingle->get(j) << '\t';
 891                     completeList.push_back(listSingle->get(j));
 892                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 893                                 }
 894                         }
 895
 896                         //get the list info from each file
 897                         for (int k = 0; k < listNames.size(); k++) {
 898
 899                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
 900
 901                                 InputData* input = new InputData(listNames[k], "list");
 902                                 ListVector* list = input->getListVector(thisLabel);
 903
 904                                 //this file has reached the end
 905                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 906                                 else {
 907                                         for (int j = 0; j < list->getNumBins(); j++) {
 908                                                 //outList << list->get(j) << '\t';
 909                         completeList.push_back(list->get(j));
 910                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 911                                         }
 912                                         delete list;
 913                                 }
 914                                 delete input;
 915                         }
 916
 917             if (countfile == "") {
 918                 SAbundVector sabund = rabund->getSAbundVector();
 919                 sabund.print(outSabund);
 920                 rabund->print(outRabund);
 921             }
 922                         //outList << endl;
 923             if (!m->printedListHeaders) {
 924                 m->listBinLabelsInFile.clear(); completeList.printHeaders(outList); }
 925             completeList.print(outList);
 926
 927                         if (rabund != NULL) { delete rabund; }
 928                 }
 929
 930                 outList.close();
 931         if (countfile == "") {
 932             outRabund.close();
 933             outSabund.close();
 934                 }
 935                 if (listSingle != NULL) { delete listSingle;  }
 936
 937                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 938
 939                 return 0;
 940         }
 941         catch(exception& e) {
 942                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 943                 exit(1);
 944         }
 945 }
 946
 947 //**********************************************************************************************************************
 948
 949 void ClusterSplitCommand::printData(ListVector* oldList){
 950         try {
 951                 string label = oldList->getLabel();
 952                 RAbundVector oldRAbund = oldList->getRAbundVector();
 953
 954                 oldRAbund.setLabel(label);
 955                 if (m->isTrue(showabund)) {
 956                         oldRAbund.getSAbundVector().print(cout);
 957                 }
 958                 oldRAbund.print(outRabund);
 959                 oldRAbund.getSAbundVector().print(outSabund);
 960
 961                 oldList->print(outList);
 962         }
 963         catch(exception& e) {
 964                 m->errorOut(e, "ClusterSplitCommand", "printData");
 965                 exit(1);
 966         }
 967 }
 968 //**********************************************************************************************************************
 969 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 970         try {
 971
 972         vector<string> listFiles;
 973         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 974         dividedNames.resize(processors);
 975
 976         //for each file group figure out which process will complete it
 977         //want to divide the load intelligently so the big files are spread between processes
 978         for (int i = 0; i < distName.size(); i++) {
 979             //cout << i << endl;
 980             int processToAssign = (i+1) % processors;
 981             if (processToAssign == 0) { processToAssign = processors; }
 982
 983             dividedNames[(processToAssign-1)].push_back(distName[i]);
 984             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
 985         }
 986
 987         //now lets reverse the order of ever other process, so we balance big files running with little ones
 988         for (int i = 0; i < processors; i++) {
 989             //cout << i << endl;
 990             int remainder = ((i+1) % processors);
 991             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 992         }
 993
 994         if (m->control_pressed) { return listFiles; }
 995
 996         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 997                 int process = 1;
 998                 processIDS.clear();
 999
1000                 //loop through and create all the processes you want
1001                 while (process != processors) {
1002                         pid_t pid = fork();
1003
1004                         if (pid > 0) {
1005                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1006                                 process++;
1007                         }else if (pid == 0){
1008                                 set<string> labels;
1009                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
1010
1011                                 //write out names to file
1012                                 string filename = m->mothurGetpid(process) + ".temp";
1013                                 ofstream out;
1014                                 m->openOutputFile(filename, out);
1015                                 out << tag << endl;
1016                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
1017                                 out.close();
1018
1019                                 //print out labels
1020                                 ofstream outLabels;
1021                                 filename = m->mothurGetpid(process) + ".temp.labels";
1022                                 m->openOutputFile(filename, outLabels);
1023
1024                                 outLabels << cutoff << endl;
1025                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
1026                                         outLabels << (*it) << endl;
1027                                 }
1028                                 outLabels.close();
1029
1030                                 exit(0);
1031                         }else {
1032                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1033                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1034                                 exit(0);
1035                         }
1036                 }
1037
1038         //do your part
1039         listFiles = cluster(dividedNames[0], labels);
1040
1041                 //force parent to wait until all the processes are done
1042                 for (int i=0;i< processIDS.size();i++) {
1043                         int temp = processIDS[i];
1044                         wait(&temp);
1045                 }
1046
1047         //get list of list file names from each process
1048         for(int i=0;i<processIDS.size();i++){
1049             string filename = toString(processIDS[i]) + ".temp";
1050             ifstream in;
1051             m->openInputFile(filename, in);
1052
1053             in >> tag; m->gobble(in);
1054
1055             while(!in.eof()) {
1056                 string tempName;
1057                 in >> tempName; m->gobble(in);
1058                 listFiles.push_back(tempName);
1059             }
1060             in.close();
1061             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1062
1063             //get labels
1064             filename = toString(processIDS[i]) + ".temp.labels";
1065             ifstream in2;
1066             m->openInputFile(filename, in2);
1067
1068             float tempCutoff;
1069             in2 >> tempCutoff; m->gobble(in2);
1070             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1071
1072             while(!in2.eof()) {
1073                 string tempName;
1074                 in2 >> tempName; m->gobble(in2);
1075                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1076             }
1077             in2.close();
1078             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1079         }
1080
1081
1082     #else
1083
1084         //////////////////////////////////////////////////////////////////////////////////////////////////////
1085                 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1086                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1087                 //Taking advantage of shared memory to allow both threads to add labels.
1088                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1089                 /*
1090                 vector<clusterData*> pDataArray;
1091                 DWORD   dwThreadIdArray[processors-1];
1092                 HANDLE  hThreadArray[processors-1];
1093
1094                 //Create processor worker threads.
1095                 for( int i=1; i<processors; i++ ){
1096                         // Allocate memory for thread data.
1097                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1098                         pDataArray.push_back(tempCluster);
1099                         processIDS.push_back(i);
1100
1101                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1102                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1103                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1104
1105                 }
1106
1107         //do your part
1108         listFiles = cluster(dividedNames[0], labels);
1109
1110                 //Wait until all threads have terminated.
1111                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1112
1113                 //Close all thread handles and free memory allocations.
1114                 for(int i=0; i < pDataArray.size(); i++){
1115             //get tag
1116             tag = pDataArray[i]->tag;
1117             //get listfiles created
1118             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1119             //get labels
1120             set<string>::iterator it;
1121             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1122                         //check cutoff
1123             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1124                         CloseHandle(hThreadArray[i]);
1125                         delete pDataArray[i];
1126                 }
1127 */
1128         #endif
1129
1130         return listFiles;
1131
1132         }
1133         catch(exception& e) {
1134                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1135                 exit(1);
1136         }
1137 }
1138 //**********************************************************************************************************************
1139
1140 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1141         try {
1142
1143                 vector<string> listFileNames;
1144                 double smallestCutoff = cutoff;
1145
1146                 //cluster each distance file
1147                 for (int i = 0; i < distNames.size(); i++) {
1148
1149                         string thisNamefile = distNames[i].begin()->second;
1150                         string thisDistFile = distNames[i].begin()->first;
1151
1152                         string listFileName = "";
1153             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1154             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1155
1156                         if (m->control_pressed) { //clean up
1157                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1158                                 listFileNames.clear(); return listFileNames;
1159                         }
1160
1161             listFileNames.push_back(listFileName);
1162         }
1163
1164                 cutoff = smallestCutoff;
1165
1166                 return listFileNames;
1167
1168         }
1169         catch(exception& e) {
1170                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1171                 exit(1);
1172         }
1173
1174
1175 }
1176 //**********************************************************************************************************************
1177 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1178         try {
1179         string listFileName = "";
1180
1181         ListVector* list = NULL;
1182         ListVector oldList;
1183         RAbundVector* rabund = NULL;
1184
1185 #ifdef USE_MPI
1186         int pid;
1187         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1188
1189         //output your files too
1190         if (pid != 0) {
1191             cout << endl << "Reading " << thisDistFile << endl;
1192         }
1193 #endif
1194
1195         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1196
1197         //reads phylip file storing data in 2D vector, also fills list and rabund
1198         bool sim = false;
1199                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1200
1201         NameAssignment* nameMap = NULL;
1202         CountTable* ct = NULL;
1203         if(namefile != ""){
1204                         nameMap = new NameAssignment(thisNamefile);
1205                         nameMap->readMap();
1206             cluster->readPhylipFile(thisDistFile, nameMap);
1207                 }else if (countfile != "") {
1208             ct = new CountTable();
1209             ct->readTable(thisNamefile, false, false);
1210             cluster->readPhylipFile(thisDistFile, ct);
1211         }
1212         tag = cluster->getTag();
1213
1214                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1215             else { delete ct; } delete cluster; return 0; }
1216
1217                 list = cluster->getListVector();
1218                 rabund = cluster->getRAbundVector();
1219
1220                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1221                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1222         listFileName = fileroot+ tag + ".list";
1223
1224         ofstream listFile;
1225                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1226
1227                 float previousDist = 0.00000;
1228                 float rndPreviousDist = 0.00000;
1229                 oldList = *list;
1230
1231 #ifdef USE_MPI
1232         //output your files too
1233         if (pid != 0) {
1234             cout << endl << "Clustering " << thisDistFile << endl;
1235         }
1236 #endif
1237
1238         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1239
1240                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1241                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1242                 else { delete ct; } return listFileName;  }
1243
1244                         cluster->update(cutoff);
1245
1246                         float dist = cluster->getSmallDist();
1247                         float rndDist;
1248                         if (hard) {
1249                                 rndDist = m->ceilDist(dist, precision);
1250                         }else{
1251                                 rndDist = m->roundDist(dist, precision);
1252                         }
1253
1254             if(previousDist <= 0.0000 && dist != previousDist){
1255                 oldList.setLabel("unique");
1256                 oldList.print(listFile);
1257                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1258             }
1259             else if(rndDist != rndPreviousDist){
1260                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1261                 oldList.print(listFile);
1262                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1263             }
1264
1265
1266                         previousDist = dist;
1267                         rndPreviousDist = rndDist;
1268                         oldList = *list;
1269                 }
1270
1271                 if(previousDist <= 0.0000){
1272             oldList.setLabel("unique");
1273             oldList.print(listFile);
1274             if (labels.count("unique") == 0) { labels.insert("unique"); }
1275         }
1276         else if(rndPreviousDist<cutoff){
1277             oldList.setLabel(toString(rndPreviousDist,  length-1));
1278             oldList.print(listFile);
1279             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1280         }
1281
1282
1283                 listFile.close();
1284
1285                 delete cluster;  delete list; delete rabund;
1286         if(namefile != ""){     delete nameMap; }
1287         else { delete ct; }
1288
1289         if (deleteFiles) {
1290             m->mothurRemove(thisDistFile);
1291             m->mothurRemove(thisNamefile);
1292         }
1293         return listFileName;
1294
1295         }
1296         catch(exception& e) {
1297                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1298                 exit(1);
1299         }
1300 }
1301
1302 //**********************************************************************************************************************
1303 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1304         try {
1305         string listFileName = "";
1306
1307         Cluster* cluster = NULL;
1308         SparseDistanceMatrix* matrix = NULL;
1309         ListVector* list = NULL;
1310         ListVector oldList;
1311         RAbundVector* rabund = NULL;
1312
1313         if (m->control_pressed) { return listFileName; }
1314
1315 #ifdef USE_MPI
1316         int pid;
1317         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1318
1319         //output your files too
1320         if (pid != 0) {
1321             cout << endl << "Reading " << thisDistFile << endl;
1322         }
1323 #endif
1324
1325         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1326
1327         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1328         read->setCutoff(cutoff);
1329
1330         NameAssignment* nameMap = NULL;
1331         CountTable* ct = NULL;
1332                 if(namefile != ""){
1333                         nameMap = new NameAssignment(thisNamefile);
1334                         nameMap->readMap();
1335             read->read(nameMap);
1336                 }else if (countfile != "") {
1337             ct = new CountTable();
1338             ct->readTable(thisNamefile, false, false);
1339             read->read(ct);
1340         }else { read->read(nameMap); }
1341
1342                 list = read->getListVector();
1343         oldList = *list;
1344                 matrix = read->getDMatrix();
1345
1346                 if(countfile != "") {
1347             rabund = new RAbundVector();
1348             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1349             delete ct;
1350         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1351
1352         delete read;  read = NULL;
1353         if (namefile != "") { delete nameMap; nameMap = NULL; }
1354
1355
1356 #ifdef USE_MPI
1357         //output your files too
1358         if (pid != 0) {
1359             cout << endl << "Clustering " << thisDistFile << endl;
1360         }
1361 #endif
1362
1363         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1364
1365         //create cluster
1366         float adjust = -1.0;
1367         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
1368         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
1369         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust);     }
1370         tag = cluster->getTag();
1371
1372         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1373         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1374
1375         ofstream listFile;
1376         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1377
1378         listFileName = fileroot+ tag + ".list";
1379
1380         float previousDist = 0.00000;
1381         float rndPreviousDist = 0.00000;
1382
1383         oldList = *list;
1384
1385         print_start = true;
1386         start = time(NULL);
1387         double saveCutoff = cutoff;
1388
1389         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1390
1391             if (m->control_pressed) { //clean up
1392                 delete matrix; delete list;     delete cluster; delete rabund;
1393                 listFile.close();
1394                 m->mothurRemove(listFileName);
1395                 return listFileName;
1396             }
1397
1398             cluster->update(saveCutoff);
1399
1400             float dist = matrix->getSmallDist();
1401             float rndDist;
1402             if (hard) {
1403                 rndDist = m->ceilDist(dist, precision);
1404             }else{
1405                 rndDist = m->roundDist(dist, precision);
1406             }
1407
1408             if(previousDist <= 0.0000 && dist != previousDist){
1409                 oldList.setLabel("unique");
1410                 oldList.print(listFile);
1411                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1412             }
1413             else if(rndDist != rndPreviousDist){
1414                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1415                 oldList.print(listFile);
1416                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1417             }
1418
1419             previousDist = dist;
1420             rndPreviousDist = rndDist;
1421             oldList = *list;
1422         }
1423
1424
1425         if(previousDist <= 0.0000){
1426             oldList.setLabel("unique");
1427             oldList.print(listFile);
1428             if (labels.count("unique") == 0) { labels.insert("unique"); }
1429         }
1430         else if(rndPreviousDist<cutoff){
1431             oldList.setLabel(toString(rndPreviousDist,  length-1));
1432             oldList.print(listFile);
1433             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1434         }
1435
1436         delete matrix; delete list;     delete cluster; delete rabund;
1437         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1438         listFile.close();
1439
1440         if (m->control_pressed) { //clean up
1441             m->mothurRemove(listFileName);
1442             return listFileName;
1443         }
1444
1445         if (deleteFiles) {
1446             m->mothurRemove(thisDistFile);
1447             m->mothurRemove(thisNamefile);
1448         }
1449
1450         if (saveCutoff != cutoff) {
1451             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1452             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1453
1454             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1455         }
1456
1457         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1458
1459         return listFileName;
1460
1461         }
1462         catch(exception& e) {
1463                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1464                 exit(1);
1465         }
1466 }
1467 //**********************************************************************************************************************
1468
1469 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1470         try{
1471
1472 #ifdef USE_MPI
1473                 int pid;
1474                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1475
1476                 if (pid != 0) {
1477 #endif
1478
1479                 string thisOutputDir = outputDir;
1480                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1481         map<string, string> variables;
1482         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1483                 string outputFileName = getOutputFileName("column", variables);
1484                 m->mothurRemove(outputFileName);
1485
1486
1487                 for (int i = 0; i < distNames.size(); i++) {
1488                         if (m->control_pressed) {  return 0; }
1489
1490                         string thisDistFile = distNames[i].begin()->first;
1491
1492                         m->appendFiles(thisDistFile, outputFileName);
1493                 }
1494
1495                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1496
1497 #ifdef USE_MPI
1498                 }
1499 #endif
1500
1501                 return 0;
1502
1503
1504         }
1505         catch(exception& e) {
1506                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1507                 exit(1);
1508         }
1509 }
1510 //**********************************************************************************************************************
1511 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1512     try {
1513         rabund->setLabel(list->getLabel());
1514         for(int i = 0; i < list->getNumBins(); i++) {
1515             if (m->control_pressed) { break; }
1516             vector<string> binNames;
1517             string bin = list->get(i);
1518             m->splitAtComma(bin, binNames);
1519             int total = 0;
1520             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1521             rabund->push_back(total);
1522         }
1523         return 0;
1524     }
1525     catch(exception& e) {
1526                 m->errorOut(e, "ClusterCommand", "createRabund");
1527                 exit(1);
1528         }
1529
1530 }
1531 //**********************************************************************************************************************
1532 int ClusterSplitCommand::printFile(string singleton, vector< map<string, string> >& distName){
1533     try {
1534         ofstream out;
1535         map<string, string> variables;
1536         string thisOutputDir = outputDir;
1537                 if (outputDir == "") { thisOutputDir = m->hasPath(distfile); }
1538         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(distfile));
1539                 string outputFileName = getOutputFileName("file", variables);
1540         m->openOutputFile(outputFileName, out);
1541
1542         out << singleton << endl;
1543         if (namefile != "") { out << "name" << endl; }
1544         else if (countfile != "") { out << "count" << endl; }
1545         else { out << "unknown" << endl; }
1546
1547         for (int i = 0; i < distName.size(); i++) {    out << distName[i].begin()->first << '\t' << distName[i].begin()->second << endl;        }
1548         out.close();
1549
1550         return 0;
1551     }
1552     catch(exception& e) {
1553                 m->errorOut(e, "ClusterCommand", "printFile");
1554                 exit(1);
1555         }
1556
1557 }
1558 //**********************************************************************************************************************
1559 string ClusterSplitCommand::readFile(vector< map<string, string> >& distName){
1560     try {
1561         string singleton, thiscolumn, thisname, type;
1562
1563         ifstream in;
1564         m->openInputFile(file, in);
1565
1566         in >> singleton; m->gobble(in);
1567         in >> type; m->gobble(in);
1568
1569         if (type == "name") { namefile = "name"; }
1570         else if (type == "count") { countfile = "count"; }
1571         else {  m->mothurOut("[ERROR]: unknown file type. Are the files in column 2 of the file name files or count files? Please change unknown to name or count.\n"); m->control_pressed = true; }
1572
1573
1574         while(!in.eof()) {
1575             if (m->control_pressed) { break; }
1576
1577             in >> thiscolumn; m->gobble(in);
1578             in >> thisname; m->gobble(in);
1579
1580             map<string, string> temp;
1581             temp[thiscolumn] = thisname;
1582             distName.push_back(temp);
1583         }
1584         in.close();
1585
1586         return singleton;
1587     }
1588     catch(exception& e) {
1589                 m->errorOut(e, "ClusterCommand", "readFile");
1590                 exit(1);
1591         }
1592
1593 }
1594 //**********************************************************************************************************************