clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11 #include "readcluster.h"
  12 #include "splitmatrix.h"
  13 #include "readphylip.h"
  14 #include "readcolumn.h"
  15 #include "readmatrix.hpp"
  16 #include "inputdata.h"
  17
  18 //**********************************************************************************************************************
  19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  21         try{
  22                 globaldata = GlobalData::getInstance();
  23                 abort = false;
  24
  25                 //allow user to run help
  26                 if(option == "help") { help(); abort = true; }
  27
  28                 else {
  29                         //valid paramters for this command
  30                         string Array[] =  {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  31                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  32
  33                         OptionParser parser(option);
  34                         map<string,string> parameters = parser.getParameters();
  35
  36                         ValidParameters validParameter;
  37
  38                         //check to make sure all parameters are valid for command
  39                         map<string,string>::iterator it;
  40                         for (it = parameters.begin(); it != parameters.end(); it++) {
  41                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
  42                                         abort = true;
  43                                 }
  44                         }
  45
  46                         globaldata->newRead();
  47
  48                         //if the user changes the output directory command factory will send this info to us in the output parameter
  49                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
  50
  51                                 //if the user changes the input directory command factory will send this info to us in the output parameter
  52                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
  53                         if (inputDir == "not found"){   inputDir = "";          }
  54                         else {
  55                                 string path;
  56                                 it = parameters.find("phylip");
  57                                 //user has given a template file
  58                                 if(it != parameters.end()){
  59                                         path = hasPath(it->second);
  60                                         //if the user has not given a path then, add inputdir. else leave path alone.
  61                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
  62                                 }
  63
  64                                 it = parameters.find("column");
  65                                 //user has given a template file
  66                                 if(it != parameters.end()){
  67                                         path = hasPath(it->second);
  68                                         //if the user has not given a path then, add inputdir. else leave path alone.
  69                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
  70                                 }
  71
  72                                 it = parameters.find("name");
  73                                 //user has given a template file
  74                                 if(it != parameters.end()){
  75                                         path = hasPath(it->second);
  76                                         //if the user has not given a path then, add inputdir. else leave path alone.
  77                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
  78                                 }
  79
  80                                 it = parameters.find("taxonomy");
  81                                 //user has given a template file
  82                                 if(it != parameters.end()){
  83                                         path = hasPath(it->second);
  84                                         //if the user has not given a path then, add inputdir. else leave path alone.
  85                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
  86                                 }
  87                         }
  88
  89                         //check for required parameters
  90                         phylipfile = validParameter.validFile(parameters, "phylip", true);
  91                         if (phylipfile == "not open") { abort = true; }
  92                         else if (phylipfile == "not found") { phylipfile = ""; }
  93                         else {  distfile = phylipfile;  format = "phylip";      }
  94
  95                         columnfile = validParameter.validFile(parameters, "column", true);
  96                         if (columnfile == "not open") { abort = true; }
  97                         else if (columnfile == "not found") { columnfile = ""; }
  98                         else {  distfile = columnfile; format = "column";       }
  99
 100                         namefile = validParameter.validFile(parameters, "name", true);
 101                         if (namefile == "not open") { abort = true; }
 102                         else if (namefile == "not found") { namefile = ""; }
 103
 104                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 105                         if (taxFile == "not open") { abort = true; }
 106                         else if (taxFile == "not found") { taxFile = ""; }
 107
 108                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
 109                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
 110
 111                         if (columnfile != "") {
 112                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
 113                         }
 114
 115                         //check for optional parameter and set defaults
 116                         // ...at some point should added some additional type checking...
 117                         //get user cutoff and precision or use defaults
 118                         string temp;
 119                         temp = validParameter.validFile(parameters, "precision", false);
 120                         if (temp == "not found") { temp = "100"; }
 121                         //saves precision legnth for formatting below
 122                         length = temp.length();
 123                         convert(temp, precision);
 124
 125                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
 126                         hard = isTrue(temp);
 127
 128                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 129                         large = isTrue(temp);
 130
 131                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 132                         convert(temp, processors);
 133
 134                         splitmethod = validParameter.validFile(parameters, "splitmethod", false);               if (splitmethod == "not found") { splitmethod = "distance"; }
 135
 136                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
 137                         convert(temp, cutoff);
 138                         cutoff += (5 / (precision * 10.0));
 139
 140                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
 141                         convert(temp, taxLevelCutoff);
 142
 143                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
 144
 145                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 146                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 147
 148                         if ((splitmethod == "distance") || (splitmethod == "classify")) { }
 149                         else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance or classify."); m->mothurOutEndLine(); abort = true; }
 150
 151                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 152
 153                         showabund = validParameter.validFile(parameters, "showabund", false);
 154                         if (showabund == "not found") { showabund = "T"; }
 155
 156                         timing = validParameter.validFile(parameters, "timing", false);
 157                         if (timing == "not found") { timing = "F"; }
 158
 159                 }
 160         }
 161         catch(exception& e) {
 162                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 163                 exit(1);
 164         }
 165 }
 166
 167 //**********************************************************************************************************************
 168
 169 void ClusterSplitCommand::help(){
 170         try {
 171                 m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Phylip or column and name are required.\n");
 172                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 173                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 174                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
 175                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
 176                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
 177                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance or classify. \n");
 178                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 179                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
 180                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 181                 m->mothurOut("The cluster.split command should be in the following format: \n");
 182                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 183                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
 184
 185         }
 186         catch(exception& e) {
 187                 m->errorOut(e, "ClusterSplitCommand", "help");
 188                 exit(1);
 189         }
 190 }
 191
 192 //**********************************************************************************************************************
 193
 194 ClusterSplitCommand::~ClusterSplitCommand(){}
 195
 196 //**********************************************************************************************************************
 197
 198 int ClusterSplitCommand::execute(){
 199         try {
 200
 201                 if (abort == true) {    return 0;       }
 202
 203                 //****************** file prep work ******************************//
 204
 205                 //if user gave a phylip file convert to column file
 206                 if (format == "phylip") {
 207
 208                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 209
 210                         NameAssignment* nameMap = NULL;
 211                         convert->setFormat("phylip");
 212                         convert->read(nameMap);
 213
 214                         if (m->control_pressed) {  delete convert;  return 0;  }
 215
 216                         distfile = convert->getOutputFile();
 217
 218                         //if no names file given with phylip file, create it
 219                         ListVector* listToMakeNameFile =  convert->getListVector();
 220                         if (namefile == "") {  //you need to make a namefile for split matrix
 221                                 ofstream out;
 222                                 namefile = phylipfile + ".names";
 223                                 openOutputFile(namefile, out);
 224                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 225                                         string bin = listToMakeNameFile->get(i);
 226                                         out << bin << '\t' << bin << endl;
 227                                 }
 228                                 out.close();
 229                         }
 230                         delete listToMakeNameFile;
 231                         delete convert;
 232                 }
 233                 if (m->control_pressed) { return 0; }
 234
 235                 time_t estart = time(NULL);
 236                 m->mothurOut("Splitting the distance file..."); m->mothurOutEndLine();
 237
 238                 //split matrix into non-overlapping groups
 239                 SplitMatrix* split;
 240                 if (splitmethod == "distance")  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                       }
 241                 else                                                    {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);   }
 242
 243                 split->split();
 244
 245                 if (m->control_pressed) { delete split; return 0; }
 246
 247                 string singletonName = split->getSingletonNames();
 248                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 249                 delete split;
 250
 251                 if (m->control_pressed) { return 0; }
 252
 253                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 254                 estart = time(NULL);
 255
 256                 //****************** break up files between processes and cluster each file set ******************************//
 257                 vector<string> listFileNames;
 258                 set<string> labels;
 259                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 260                                 if(processors == 1){
 261                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 262                                 }else{
 263                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 264                                         dividedNames.resize(processors);
 265
 266                                         //for each file group figure out which process will complete it
 267                                         //want to divide the load intelligently so the big files are spread between processes
 268                                         int count = 1;
 269                                         for (int i = 0; i < distName.size(); i++) {
 270                                                 int processToAssign = (i+1) % processors;
 271                                                 if (processToAssign == 0) { processToAssign = processors; }
 272
 273                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 274                                         }
 275
 276                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 277                                         for (int i = 0; i < processors; i++) {
 278                                                 int remainder = ((i+1) % processors);
 279                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 280                                         }
 281
 282                                         createProcesses(dividedNames);
 283
 284                                         if (m->control_pressed) { return 0; }
 285
 286                                         //get list of list file names from each process
 287                                         for(int i=0;i<processors;i++){
 288                                                 string filename = toString(processIDS[i]) + ".temp";
 289                                                 ifstream in;
 290                                                 openInputFile(filename, in);
 291
 292                                                 in >> tag; gobble(in);
 293
 294                                                 while(!in.eof()) {
 295                                                         string tempName;
 296                                                         in >> tempName; gobble(in);
 297                                                         listFileNames.push_back(tempName);
 298                                                 }
 299                                                 in.close();
 300                                                 remove((toString(processIDS[i]) + ".temp").c_str());
 301
 302                                                 //get labels
 303                                                 filename = toString(processIDS[i]) + ".temp.labels";
 304                                                 ifstream in2;
 305                                                 openInputFile(filename, in2);
 306
 307                                                 while(!in2.eof()) {
 308                                                         string tempName;
 309                                                         in2 >> tempName; gobble(in2);
 310                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
 311                                                 }
 312                                                 in2.close();
 313                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
 314                                         }
 315                                 }
 316                 #else
 317                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 318                 #endif
 319
 320                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 321
 322                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 323
 324                 //****************** merge list file and create rabund and sabund files ******************************//
 325                 estart = time(NULL);
 326                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 327
 328                 ListVector* listSingle;
 329                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 330
 331                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 332
 333                 mergeLists(listFileNames, labelBins, listSingle);
 334
 335                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 336
 337                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 338
 339                 m->mothurOutEndLine();
 340                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 341                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 342                 m->mothurOutEndLine();
 343
 344                 return 0;
 345         }
 346         catch(exception& e) {
 347                 m->errorOut(e, "ClusterSplitCommand", "execute");
 348                 exit(1);
 349         }
 350 }
 351 //**********************************************************************************************************************
 352 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 353         try {
 354
 355                 map<float, int> labelBin;
 356                 vector<float> orderFloat;
 357                 int numSingleBins;
 358
 359                 //read in singletons
 360                 if (singleton != "none") {
 361                         ifstream in;
 362                         openInputFile(singleton, in);
 363
 364                         string firstCol, secondCol;
 365                         listSingle = new ListVector();
 366                         while (!in.eof()) {
 367                                 in >> firstCol >> secondCol; gobble(in);
 368                                 listSingle->push_back(secondCol);
 369                         }
 370                         in.close();
 371                         remove(singleton.c_str());
 372
 373                         numSingleBins = listSingle->getNumBins();
 374                 }else{  listSingle = NULL; numSingleBins = 0;  }
 375
 376                 //go through users set and make them floats so we can sort them
 377                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 378                         float temp = -10.0;
 379
 380                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 381                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 382
 383                         orderFloat.push_back(temp);
 384                         labelBin[temp] = numSingleBins; //initialize numbins
 385                 }
 386
 387                 //sort order
 388                 sort(orderFloat.begin(), orderFloat.end());
 389                 userLabels.clear();
 390
 391                 //get the list info from each file
 392                 for (int k = 0; k < listNames.size(); k++) {
 393
 394                         if (m->control_pressed) {
 395                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
 396                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
 397                                 return labelBin;
 398                         }
 399
 400                         InputData* input = new InputData(listNames[k], "list");
 401                         ListVector* list = input->getListVector();
 402                         string lastLabel = list->getLabel();
 403
 404                         string filledInList = listNames[k] + "filledInTemp";
 405                         ofstream outFilled;
 406                         openOutputFile(filledInList, outFilled);
 407
 408                         //for each label needed
 409                         for(int l = 0; l < orderFloat.size(); l++){
 410
 411                                 string thisLabel;
 412                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 413                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 414
 415                                 //this file has reached the end
 416                                 if (list == NULL) {
 417                                         list = input->getListVector(lastLabel, true);
 418                                 }else{  //do you have the distance, or do you need to fill in
 419
 420                                         float labelFloat;
 421                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 422                                         else { convert(list->getLabel(), labelFloat); }
 423
 424                                         //check for missing labels
 425                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 426                                                 //if its bigger get last label, otherwise keep it
 427                                                 delete list;
 428                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 429                                         }
 430                                         lastLabel = list->getLabel();
 431                                 }
 432
 433                                 //print to new file
 434                                 list->setLabel(thisLabel);
 435                                 list->print(outFilled);
 436
 437                                 //update labelBin
 438                                 labelBin[orderFloat[l]] += list->getNumBins();
 439
 440                                 delete list;
 441
 442                                 list = input->getListVector();
 443                         }
 444
 445                         if (list != NULL) { delete list; }
 446                         delete input;
 447
 448                         outFilled.close();
 449                         remove(listNames[k].c_str());
 450                         rename(filledInList.c_str(), listNames[k].c_str());
 451                 }
 452
 453                 return labelBin;
 454         }
 455         catch(exception& e) {
 456                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 457                 exit(1);
 458         }
 459 }
 460 //**********************************************************************************************************************
 461 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 462         try {
 463                 if (outputDir == "") { outputDir += hasPath(distfile); }
 464                 fileroot = outputDir + getRootName(getSimpleName(distfile));
 465
 466                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
 467                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
 468                 openOutputFile(fileroot+ tag + ".list",         outList);
 469
 470                 outputNames.push_back(fileroot+ tag + ".sabund");
 471                 outputNames.push_back(fileroot+ tag + ".rabund");
 472                 outputNames.push_back(fileroot+ tag + ".list");
 473
 474                 map<float, int>::iterator itLabel;
 475
 476                 //for each label needed
 477                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 478
 479                         string thisLabel;
 480                         if (itLabel->first == -1) { thisLabel = "unique"; }
 481                         else { thisLabel = toString(itLabel->first,  length-1);  }
 482
 483                         outList << thisLabel << '\t' << itLabel->second << '\t';
 484
 485                         RAbundVector* rabund = new RAbundVector();
 486                         rabund->setLabel(thisLabel);
 487
 488                         //add in singletons
 489                         if (listSingle != NULL) {
 490                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 491                                         outList << listSingle->get(j) << '\t';
 492                                         rabund->push_back(getNumNames(listSingle->get(j)));
 493                                 }
 494                         }
 495
 496                         //get the list info from each file
 497                         for (int k = 0; k < listNames.size(); k++) {
 498
 499                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
 500
 501                                 InputData* input = new InputData(listNames[k], "list");
 502                                 ListVector* list = input->getListVector(thisLabel);
 503
 504                                 //this file has reached the end
 505                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 506                                 else {
 507                                         for (int j = 0; j < list->getNumBins(); j++) {
 508                                                 outList << list->get(j) << '\t';
 509                                                 rabund->push_back(getNumNames(list->get(j)));
 510                                         }
 511                                         delete list;
 512                                 }
 513                                 delete input;
 514                         }
 515
 516                         SAbundVector sabund = rabund->getSAbundVector();
 517
 518                         sabund.print(outSabund);
 519                         rabund->print(outRabund);
 520                         outList << endl;
 521
 522                         delete rabund;
 523                 }
 524
 525                 outList.close();
 526                 outRabund.close();
 527                 outSabund.close();
 528
 529                 if (listSingle != NULL) { delete listSingle;  }
 530
 531                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
 532
 533                 return 0;
 534         }
 535         catch(exception& e) {
 536                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 537                 exit(1);
 538         }
 539 }
 540
 541 //**********************************************************************************************************************
 542
 543 void ClusterSplitCommand::printData(ListVector* oldList){
 544         try {
 545                 string label = oldList->getLabel();
 546                 RAbundVector oldRAbund = oldList->getRAbundVector();
 547
 548                 oldRAbund.setLabel(label);
 549                 if (isTrue(showabund)) {
 550                         oldRAbund.getSAbundVector().print(cout);
 551                 }
 552                 oldRAbund.print(outRabund);
 553                 oldRAbund.getSAbundVector().print(outSabund);
 554
 555                 oldList->print(outList);
 556         }
 557         catch(exception& e) {
 558                 m->errorOut(e, "ClusterSplitCommand", "printData");
 559                 exit(1);
 560         }
 561 }
 562 //**********************************************************************************************************************
 563 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
 564         try {
 565
 566         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 567                 int process = 0;
 568                 int exitCommand = 1;
 569                 processIDS.clear();
 570
 571                 //loop through and create all the processes you want
 572                 while (process != processors) {
 573                         int pid = fork();
 574
 575                         if (pid > 0) {
 576                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 577                                 process++;
 578                         }else if (pid == 0){
 579                                 set<string> labels;
 580                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 581
 582                                 //write out names to file
 583                                 string filename = toString(getpid()) + ".temp";
 584                                 ofstream out;
 585                                 openOutputFile(filename, out);
 586                                 out << tag << endl;
 587                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 588                                 out.close();
 589
 590                                 //print out labels
 591                                 ofstream outLabels;
 592                                 filename = toString(getpid()) + ".temp.labels";
 593                                 openOutputFile(filename, outLabels);
 594
 595                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 596                                         outLabels << (*it) << endl;
 597                                 }
 598                                 outLabels.close();
 599
 600                                 exit(0);
 601                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
 602                 }
 603
 604                 //force parent to wait until all the processes are done
 605                 for (int i=0;i<processors;i++) {
 606                         int temp = processIDS[i];
 607                         wait(&temp);
 608                 }
 609
 610                 return exitCommand;
 611         #endif
 612
 613         }
 614         catch(exception& e) {
 615                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
 616                 exit(1);
 617         }
 618 }
 619 //**********************************************************************************************************************
 620
 621 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 622         try {
 623                 Cluster* cluster;
 624                 SparseMatrix* matrix;
 625                 ListVector* list;
 626                 ListVector oldList;
 627                 RAbundVector* rabund;
 628
 629                 vector<string> listFileNames;
 630
 631                 //cluster each distance file
 632                 for (int i = 0; i < distNames.size(); i++) {
 633
 634                         string thisNamefile = distNames[i].begin()->second;
 635                         string thisDistFile = distNames[i].begin()->first;
 636
 637                         //read in distance file
 638                         globaldata->setNameFile(thisNamefile);
 639                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 640
 641                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
 642
 643                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
 644                         read->setCutoff(cutoff);
 645
 646                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
 647                         nameMap->readMap();
 648                         read->read(nameMap);
 649
 650                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
 651
 652                         list = read->getListVector();
 653                         oldList = *list;
 654                         matrix = read->getMatrix();
 655
 656                         delete read;
 657                         delete nameMap;
 658
 659                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 660
 661                         rabund = new RAbundVector(list->getRAbundVector());
 662
 663                         //create cluster
 664                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
 665                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
 666                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
 667                         tag = cluster->getTag();
 668
 669                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
 670                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
 671
 672                         ofstream listFile;
 673                         openOutputFile(fileroot+ tag + ".list", listFile);
 674
 675                         listFileNames.push_back(fileroot+ tag + ".list");
 676
 677                         time_t estart = time(NULL);
 678
 679                         float previousDist = 0.00000;
 680                         float rndPreviousDist = 0.00000;
 681
 682                         oldList = *list;
 683
 684                         print_start = true;
 685                         start = time(NULL);
 686                         double saveCutoff = cutoff;
 687
 688                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
 689
 690                                 if (m->control_pressed) { //clean up
 691                                         delete matrix; delete list;     delete cluster; delete rabund;
 692                                         listFile.close();
 693                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 694                                         listFileNames.clear(); return listFileNames;
 695                                 }
 696
 697                                 cluster->update(cutoff);
 698
 699                                 float dist = matrix->getSmallDist();
 700                                 float rndDist;
 701                                 if (hard) {
 702                                         rndDist = ceilDist(dist, precision);
 703                                 }else{
 704                                         rndDist = roundDist(dist, precision);
 705                                 }
 706
 707                                 if(previousDist <= 0.0000 && dist != previousDist){
 708                                         oldList.setLabel("unique");
 709                                         oldList.print(listFile);
 710                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
 711                                 }
 712                                 else if(rndDist != rndPreviousDist){
 713                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
 714                                         oldList.print(listFile);
 715                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 716                                 }
 717
 718                                 previousDist = dist;
 719                                 rndPreviousDist = rndDist;
 720                                 oldList = *list;
 721                         }
 722
 723
 724                         if(previousDist <= 0.0000){
 725                                 oldList.setLabel("unique");
 726                                 oldList.print(listFile);
 727                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
 728                         }
 729                         else if(rndPreviousDist<cutoff){
 730                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
 731                                 oldList.print(listFile);
 732                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 733                         }
 734
 735                         delete matrix; delete list;     delete cluster; delete rabund;
 736                         listFile.close();
 737
 738                         if (m->control_pressed) { //clean up
 739                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 740                                 listFileNames.clear(); return listFileNames;
 741                         }
 742
 743                         remove(thisDistFile.c_str());
 744                         remove(thisNamefile.c_str());
 745                 }
 746
 747
 748                 return listFileNames;
 749
 750         }
 751         catch(exception& e) {
 752                 m->errorOut(e, "ClusterSplitCommand", "cluster");
 753                 exit(1);
 754         }
 755
 756
 757 }
 758
 759 //**********************************************************************************************************************