clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11 #include "readcluster.h"
  12 #include "splitmatrix.h"
  13 #include "readphylip.h"
  14 #include "readcolumn.h"
  15 #include "readmatrix.hpp"
  16 #include "inputdata.h"
  17
  18 //**********************************************************************************************************************
  19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  21         try{
  22                 globaldata = GlobalData::getInstance();
  23                 abort = false;
  24                 format = "";
  25
  26                 //allow user to run help
  27                 if(option == "help") { help(); abort = true; }
  28
  29                 else {
  30                         //valid paramters for this command
  31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  33
  34                         OptionParser parser(option);
  35                         map<string,string> parameters = parser.getParameters();
  36
  37                         ValidParameters validParameter;
  38
  39                         //check to make sure all parameters are valid for command
  40                         map<string,string>::iterator it;
  41                         for (it = parameters.begin(); it != parameters.end(); it++) {
  42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
  43                                         abort = true;
  44                                 }
  45                         }
  46
  47                         globaldata->newRead();
  48
  49                         //if the user changes the output directory command factory will send this info to us in the output parameter
  50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
  51
  52                                 //if the user changes the input directory command factory will send this info to us in the output parameter
  53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
  54                         if (inputDir == "not found"){   inputDir = "";          }
  55                         else {
  56                                 string path;
  57                                 it = parameters.find("phylip");
  58                                 //user has given a template file
  59                                 if(it != parameters.end()){
  60                                         path = hasPath(it->second);
  61                                         //if the user has not given a path then, add inputdir. else leave path alone.
  62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
  63                                 }
  64
  65                                 it = parameters.find("column");
  66                                 //user has given a template file
  67                                 if(it != parameters.end()){
  68                                         path = hasPath(it->second);
  69                                         //if the user has not given a path then, add inputdir. else leave path alone.
  70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
  71                                 }
  72
  73                                 it = parameters.find("name");
  74                                 //user has given a template file
  75                                 if(it != parameters.end()){
  76                                         path = hasPath(it->second);
  77                                         //if the user has not given a path then, add inputdir. else leave path alone.
  78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
  79                                 }
  80
  81                                 it = parameters.find("taxonomy");
  82                                 //user has given a template file
  83                                 if(it != parameters.end()){
  84                                         path = hasPath(it->second);
  85                                         //if the user has not given a path then, add inputdir. else leave path alone.
  86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
  87                                 }
  88
  89                                 it = parameters.find("fasta");
  90                                 //user has given a template file
  91                                 if(it != parameters.end()){
  92                                         path = hasPath(it->second);
  93                                         //if the user has not given a path then, add inputdir. else leave path alone.
  94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
  95                                 }
  96                         }
  97
  98                         //check for required parameters
  99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 100                         if (phylipfile == "not open") { abort = true; }
 101                         else if (phylipfile == "not found") { phylipfile = ""; }
 102                         else {  distfile = phylipfile;  format = "phylip";      }
 103
 104                         columnfile = validParameter.validFile(parameters, "column", true);
 105                         if (columnfile == "not open") { abort = true; }
 106                         else if (columnfile == "not found") { columnfile = ""; }
 107                         else {  distfile = columnfile; format = "column";       }
 108
 109                         namefile = validParameter.validFile(parameters, "name", true);
 110                         if (namefile == "not open") { abort = true; }
 111                         else if (namefile == "not found") { namefile = ""; }
 112
 113                         fastafile = validParameter.validFile(parameters, "fasta", true);
 114                         if (fastafile == "not open") { abort = true; }
 115                         else if (fastafile == "not found") { fastafile = ""; }
 116                         else { splitmethod = "fasta";  }
 117
 118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 119                         if (taxFile == "not open") { abort = true; }
 120                         else if (taxFile == "not found") { taxFile = ""; }
 121
 122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
 123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 124
 125                         if (columnfile != "") {
 126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
 127                         }
 128
 129                         if (fastafile != "") {
 130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 131                         }
 132
 133                         //check for optional parameter and set defaults
 134                         // ...at some point should added some additional type checking...
 135                         //get user cutoff and precision or use defaults
 136                         string temp;
 137                         temp = validParameter.validFile(parameters, "precision", false);
 138                         if (temp == "not found") { temp = "100"; }
 139                         //saves precision legnth for formatting below
 140                         length = temp.length();
 141                         convert(temp, precision);
 142
 143                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
 144                         hard = isTrue(temp);
 145
 146                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 147                         large = isTrue(temp);
 148
 149                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 150                         convert(temp, processors);
 151
 152                         temp = validParameter.validFile(parameters, "splitmethod", false);
 153                         if (splitmethod != "fasta") {
 154                                 if (temp == "not found")  { splitmethod = "distance"; }
 155                                 else {  splitmethod = temp; }
 156                         }
 157
 158                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
 159                         convert(temp, cutoff);
 160                         cutoff += (5 / (precision * 10.0));
 161
 162                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
 163                         convert(temp, taxLevelCutoff);
 164
 165                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
 166
 167                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 168                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 169
 170                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 171                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 172
 173                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 174
 175                         showabund = validParameter.validFile(parameters, "showabund", false);
 176                         if (showabund == "not found") { showabund = "T"; }
 177
 178                         timing = validParameter.validFile(parameters, "timing", false);
 179                         if (timing == "not found") { timing = "F"; }
 180
 181                 }
 182         }
 183         catch(exception& e) {
 184                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 185                 exit(1);
 186         }
 187 }
 188
 189 //**********************************************************************************************************************
 190
 191 void ClusterSplitCommand::help(){
 192         try {
 193                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
 194                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
 195                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
 196                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
 197                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
 198                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file and taxonomy file.  \n");
 199                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
 200                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 201                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
 202                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 203                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
 204                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
 205                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
 206                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
 207                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 208                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
 209                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 210                 m->mothurOut("The cluster.split command should be in the following format: \n");
 211                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 212                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
 213
 214         }
 215         catch(exception& e) {
 216                 m->errorOut(e, "ClusterSplitCommand", "help");
 217                 exit(1);
 218         }
 219 }
 220
 221 //**********************************************************************************************************************
 222
 223 ClusterSplitCommand::~ClusterSplitCommand(){}
 224
 225 //**********************************************************************************************************************
 226
 227 int ClusterSplitCommand::execute(){
 228         try {
 229
 230                 if (abort == true) {    return 0;       }
 231                 time_t estart;
 232                 //****************** file prep work ******************************//
 233
 234                 //if user gave a phylip file convert to column file
 235                 if (format == "phylip") {
 236                         estart = time(NULL);
 237                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 238
 239                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 240
 241                         NameAssignment* nameMap = NULL;
 242                         convert->setFormat("phylip");
 243                         convert->read(nameMap);
 244
 245                         if (m->control_pressed) {  delete convert;  return 0;  }
 246
 247                         distfile = convert->getOutputFile();
 248
 249                         //if no names file given with phylip file, create it
 250                         ListVector* listToMakeNameFile =  convert->getListVector();
 251                         if (namefile == "") {  //you need to make a namefile for split matrix
 252                                 ofstream out;
 253                                 namefile = phylipfile + ".names";
 254                                 openOutputFile(namefile, out);
 255                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 256                                         string bin = listToMakeNameFile->get(i);
 257                                         out << bin << '\t' << bin << endl;
 258                                 }
 259                                 out.close();
 260                         }
 261                         delete listToMakeNameFile;
 262                         delete convert;
 263
 264                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 265                 }
 266                 if (m->control_pressed) { return 0; }
 267
 268                 estart = time(NULL);
 269                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 270
 271                 //split matrix into non-overlapping groups
 272                 SplitMatrix* split;
 273                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                       }
 274                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);   }
 275                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, taxFile, taxLevelCutoff, splitmethod);                                       }
 276                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
 277
 278                 split->split();
 279
 280                 if (m->control_pressed) { delete split; return 0; }
 281
 282                 string singletonName = split->getSingletonNames();
 283                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 284                 delete split;
 285
 286                 if (m->control_pressed) { return 0; }
 287
 288                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 289                 estart = time(NULL);
 290
 291                 //****************** break up files between processes and cluster each file set ******************************//
 292                 vector<string> listFileNames;
 293                 set<string> labels;
 294                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 295                                 if(processors == 1){
 296                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 297                                 }else{
 298                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 299                                         dividedNames.resize(processors);
 300
 301                                         //for each file group figure out which process will complete it
 302                                         //want to divide the load intelligently so the big files are spread between processes
 303                                         int count = 1;
 304                                         for (int i = 0; i < distName.size(); i++) {
 305                                                 int processToAssign = (i+1) % processors;
 306                                                 if (processToAssign == 0) { processToAssign = processors; }
 307
 308                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 309                                         }
 310
 311                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 312                                         for (int i = 0; i < processors; i++) {
 313                                                 int remainder = ((i+1) % processors);
 314                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 315                                         }
 316
 317                                         createProcesses(dividedNames);
 318
 319                                         if (m->control_pressed) { return 0; }
 320
 321                                         //get list of list file names from each process
 322                                         for(int i=0;i<processors;i++){
 323                                                 string filename = toString(processIDS[i]) + ".temp";
 324                                                 ifstream in;
 325                                                 openInputFile(filename, in);
 326
 327                                                 in >> tag; gobble(in);
 328
 329                                                 while(!in.eof()) {
 330                                                         string tempName;
 331                                                         in >> tempName; gobble(in);
 332                                                         listFileNames.push_back(tempName);
 333                                                 }
 334                                                 in.close();
 335                                                 remove((toString(processIDS[i]) + ".temp").c_str());
 336
 337                                                 //get labels
 338                                                 filename = toString(processIDS[i]) + ".temp.labels";
 339                                                 ifstream in2;
 340                                                 openInputFile(filename, in2);
 341
 342                                                 while(!in2.eof()) {
 343                                                         string tempName;
 344                                                         in2 >> tempName; gobble(in2);
 345                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
 346                                                 }
 347                                                 in2.close();
 348                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
 349                                         }
 350                                 }
 351                 #else
 352                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 353                 #endif
 354
 355                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 356
 357                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 358
 359                 //****************** merge list file and create rabund and sabund files ******************************//
 360                 estart = time(NULL);
 361                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 362
 363                 ListVector* listSingle;
 364                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 365
 366                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 367
 368                 mergeLists(listFileNames, labelBins, listSingle);
 369
 370                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 371
 372                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 373
 374                 m->mothurOutEndLine();
 375                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 376                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 377                 m->mothurOutEndLine();
 378
 379                 return 0;
 380         }
 381         catch(exception& e) {
 382                 m->errorOut(e, "ClusterSplitCommand", "execute");
 383                 exit(1);
 384         }
 385 }
 386 //**********************************************************************************************************************
 387 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 388         try {
 389
 390                 map<float, int> labelBin;
 391                 vector<float> orderFloat;
 392                 int numSingleBins;
 393
 394                 //read in singletons
 395                 if (singleton != "none") {
 396                         ifstream in;
 397                         openInputFile(singleton, in);
 398
 399                         string firstCol, secondCol;
 400                         listSingle = new ListVector();
 401                         while (!in.eof()) {
 402                                 in >> firstCol >> secondCol; gobble(in);
 403                                 listSingle->push_back(secondCol);
 404                         }
 405                         in.close();
 406                         remove(singleton.c_str());
 407
 408                         numSingleBins = listSingle->getNumBins();
 409                 }else{  listSingle = NULL; numSingleBins = 0;  }
 410
 411                 //go through users set and make them floats so we can sort them
 412                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 413                         float temp = -10.0;
 414
 415                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 416                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 417
 418                         orderFloat.push_back(temp);
 419                         labelBin[temp] = numSingleBins; //initialize numbins
 420                 }
 421
 422                 //sort order
 423                 sort(orderFloat.begin(), orderFloat.end());
 424                 userLabels.clear();
 425
 426                 //get the list info from each file
 427                 for (int k = 0; k < listNames.size(); k++) {
 428
 429                         if (m->control_pressed) {
 430                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
 431                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
 432                                 return labelBin;
 433                         }
 434
 435                         InputData* input = new InputData(listNames[k], "list");
 436                         ListVector* list = input->getListVector();
 437                         string lastLabel = list->getLabel();
 438
 439                         string filledInList = listNames[k] + "filledInTemp";
 440                         ofstream outFilled;
 441                         openOutputFile(filledInList, outFilled);
 442
 443                         //for each label needed
 444                         for(int l = 0; l < orderFloat.size(); l++){
 445
 446                                 string thisLabel;
 447                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 448                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 449
 450                                 //this file has reached the end
 451                                 if (list == NULL) {
 452                                         list = input->getListVector(lastLabel, true);
 453                                 }else{  //do you have the distance, or do you need to fill in
 454
 455                                         float labelFloat;
 456                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 457                                         else { convert(list->getLabel(), labelFloat); }
 458
 459                                         //check for missing labels
 460                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 461                                                 //if its bigger get last label, otherwise keep it
 462                                                 delete list;
 463                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 464                                         }
 465                                         lastLabel = list->getLabel();
 466                                 }
 467
 468                                 //print to new file
 469                                 list->setLabel(thisLabel);
 470                                 list->print(outFilled);
 471
 472                                 //update labelBin
 473                                 labelBin[orderFloat[l]] += list->getNumBins();
 474
 475                                 delete list;
 476
 477                                 list = input->getListVector();
 478                         }
 479
 480                         if (list != NULL) { delete list; }
 481                         delete input;
 482
 483                         outFilled.close();
 484                         remove(listNames[k].c_str());
 485                         rename(filledInList.c_str(), listNames[k].c_str());
 486                 }
 487
 488                 return labelBin;
 489         }
 490         catch(exception& e) {
 491                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 492                 exit(1);
 493         }
 494 }
 495 //**********************************************************************************************************************
 496 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 497         try {
 498                 if (outputDir == "") { outputDir += hasPath(distfile); }
 499                 fileroot = outputDir + getRootName(getSimpleName(distfile));
 500
 501                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
 502                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
 503                 openOutputFile(fileroot+ tag + ".list",         outList);
 504
 505                 outputNames.push_back(fileroot+ tag + ".sabund");
 506                 outputNames.push_back(fileroot+ tag + ".rabund");
 507                 outputNames.push_back(fileroot+ tag + ".list");
 508
 509                 map<float, int>::iterator itLabel;
 510
 511                 //for each label needed
 512                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 513
 514                         string thisLabel;
 515                         if (itLabel->first == -1) { thisLabel = "unique"; }
 516                         else { thisLabel = toString(itLabel->first,  length-1);  }
 517
 518                         outList << thisLabel << '\t' << itLabel->second << '\t';
 519
 520                         RAbundVector* rabund = new RAbundVector();
 521                         rabund->setLabel(thisLabel);
 522
 523                         //add in singletons
 524                         if (listSingle != NULL) {
 525                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 526                                         outList << listSingle->get(j) << '\t';
 527                                         rabund->push_back(getNumNames(listSingle->get(j)));
 528                                 }
 529                         }
 530
 531                         //get the list info from each file
 532                         for (int k = 0; k < listNames.size(); k++) {
 533
 534                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
 535
 536                                 InputData* input = new InputData(listNames[k], "list");
 537                                 ListVector* list = input->getListVector(thisLabel);
 538
 539                                 //this file has reached the end
 540                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 541                                 else {
 542                                         for (int j = 0; j < list->getNumBins(); j++) {
 543                                                 outList << list->get(j) << '\t';
 544                                                 rabund->push_back(getNumNames(list->get(j)));
 545                                         }
 546                                         delete list;
 547                                 }
 548                                 delete input;
 549                         }
 550
 551                         SAbundVector sabund = rabund->getSAbundVector();
 552
 553                         sabund.print(outSabund);
 554                         rabund->print(outRabund);
 555                         outList << endl;
 556
 557                         delete rabund;
 558                 }
 559
 560                 outList.close();
 561                 outRabund.close();
 562                 outSabund.close();
 563
 564                 if (listSingle != NULL) { delete listSingle;  }
 565
 566                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
 567
 568                 return 0;
 569         }
 570         catch(exception& e) {
 571                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 572                 exit(1);
 573         }
 574 }
 575
 576 //**********************************************************************************************************************
 577
 578 void ClusterSplitCommand::printData(ListVector* oldList){
 579         try {
 580                 string label = oldList->getLabel();
 581                 RAbundVector oldRAbund = oldList->getRAbundVector();
 582
 583                 oldRAbund.setLabel(label);
 584                 if (isTrue(showabund)) {
 585                         oldRAbund.getSAbundVector().print(cout);
 586                 }
 587                 oldRAbund.print(outRabund);
 588                 oldRAbund.getSAbundVector().print(outSabund);
 589
 590                 oldList->print(outList);
 591         }
 592         catch(exception& e) {
 593                 m->errorOut(e, "ClusterSplitCommand", "printData");
 594                 exit(1);
 595         }
 596 }
 597 //**********************************************************************************************************************
 598 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
 599         try {
 600
 601         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 602                 int process = 0;
 603                 int exitCommand = 1;
 604                 processIDS.clear();
 605
 606                 //loop through and create all the processes you want
 607                 while (process != processors) {
 608                         int pid = fork();
 609
 610                         if (pid > 0) {
 611                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 612                                 process++;
 613                         }else if (pid == 0){
 614                                 set<string> labels;
 615                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 616
 617                                 //write out names to file
 618                                 string filename = toString(getpid()) + ".temp";
 619                                 ofstream out;
 620                                 openOutputFile(filename, out);
 621                                 out << tag << endl;
 622                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 623                                 out.close();
 624
 625                                 //print out labels
 626                                 ofstream outLabels;
 627                                 filename = toString(getpid()) + ".temp.labels";
 628                                 openOutputFile(filename, outLabels);
 629
 630                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 631                                         outLabels << (*it) << endl;
 632                                 }
 633                                 outLabels.close();
 634
 635                                 exit(0);
 636                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
 637                 }
 638
 639                 //force parent to wait until all the processes are done
 640                 for (int i=0;i<processors;i++) {
 641                         int temp = processIDS[i];
 642                         wait(&temp);
 643                 }
 644
 645                 return exitCommand;
 646         #endif
 647
 648         }
 649         catch(exception& e) {
 650                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
 651                 exit(1);
 652         }
 653 }
 654 //**********************************************************************************************************************
 655
 656 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 657         try {
 658                 Cluster* cluster;
 659                 SparseMatrix* matrix;
 660                 ListVector* list;
 661                 ListVector oldList;
 662                 RAbundVector* rabund;
 663
 664                 vector<string> listFileNames;
 665
 666                 //cluster each distance file
 667                 for (int i = 0; i < distNames.size(); i++) {
 668
 669                         string thisNamefile = distNames[i].begin()->second;
 670                         string thisDistFile = distNames[i].begin()->first;
 671
 672                         //read in distance file
 673                         globaldata->setNameFile(thisNamefile);
 674                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 675
 676                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
 677
 678                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
 679                         read->setCutoff(cutoff);
 680
 681                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
 682                         nameMap->readMap();
 683                         read->read(nameMap);
 684
 685                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
 686
 687                         list = read->getListVector();
 688                         oldList = *list;
 689                         matrix = read->getMatrix();
 690
 691                         delete read;
 692                         delete nameMap;
 693
 694                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 695
 696                         rabund = new RAbundVector(list->getRAbundVector());
 697
 698                         //create cluster
 699                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
 700                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
 701                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
 702                         tag = cluster->getTag();
 703
 704                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
 705                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
 706
 707                         ofstream listFile;
 708                         openOutputFile(fileroot+ tag + ".list", listFile);
 709
 710                         listFileNames.push_back(fileroot+ tag + ".list");
 711
 712                         time_t estart = time(NULL);
 713
 714                         float previousDist = 0.00000;
 715                         float rndPreviousDist = 0.00000;
 716
 717                         oldList = *list;
 718
 719                         print_start = true;
 720                         start = time(NULL);
 721                         double saveCutoff = cutoff;
 722
 723                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
 724
 725                                 if (m->control_pressed) { //clean up
 726                                         delete matrix; delete list;     delete cluster; delete rabund;
 727                                         listFile.close();
 728                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 729                                         listFileNames.clear(); return listFileNames;
 730                                 }
 731
 732                                 cluster->update(cutoff);
 733
 734                                 float dist = matrix->getSmallDist();
 735                                 float rndDist;
 736                                 if (hard) {
 737                                         rndDist = ceilDist(dist, precision);
 738                                 }else{
 739                                         rndDist = roundDist(dist, precision);
 740                                 }
 741
 742                                 if(previousDist <= 0.0000 && dist != previousDist){
 743                                         oldList.setLabel("unique");
 744                                         oldList.print(listFile);
 745                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
 746                                 }
 747                                 else if(rndDist != rndPreviousDist){
 748                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
 749                                         oldList.print(listFile);
 750                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 751                                 }
 752
 753                                 previousDist = dist;
 754                                 rndPreviousDist = rndDist;
 755                                 oldList = *list;
 756                         }
 757
 758
 759                         if(previousDist <= 0.0000){
 760                                 oldList.setLabel("unique");
 761                                 oldList.print(listFile);
 762                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
 763                         }
 764                         else if(rndPreviousDist<cutoff){
 765                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
 766                                 oldList.print(listFile);
 767                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 768                         }
 769
 770                         delete matrix; delete list;     delete cluster; delete rabund;
 771                         listFile.close();
 772
 773                         if (m->control_pressed) { //clean up
 774                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 775                                 listFileNames.clear(); return listFileNames;
 776                         }
 777
 778                         //remove(thisDistFile.c_str());
 779                         //remove(thisNamefile.c_str());
 780                 }
 781
 782
 783                 return listFileNames;
 784
 785         }
 786         catch(exception& e) {
 787                 m->errorOut(e, "ClusterSplitCommand", "cluster");
 788                 exit(1);
 789         }
 790
 791
 792 }
 793
 794 //**********************************************************************************************************************