clustersplitcommand.cpp

   1 /*
   2  *  clustersplitcommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 5/19/10.
   6  *  Copyright 2010 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "clustersplitcommand.h"
  11 #include "readcluster.h"
  12 #include "splitmatrix.h"
  13 #include "readphylip.h"
  14 #include "readcolumn.h"
  15 #include "readmatrix.hpp"
  16 #include "inputdata.h"
  17
  18 //**********************************************************************************************************************
  19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
  20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
  21         try{
  22                 globaldata = GlobalData::getInstance();
  23                 abort = false;
  24                 format = "";
  25
  26                 //allow user to run help
  27                 if(option == "help") { help(); abort = true; }
  28
  29                 else {
  30                         //valid paramters for this command
  31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
  32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  33
  34                         OptionParser parser(option);
  35                         map<string,string> parameters = parser.getParameters();
  36
  37                         ValidParameters validParameter;
  38
  39                         //check to make sure all parameters are valid for command
  40                         map<string,string>::iterator it;
  41                         for (it = parameters.begin(); it != parameters.end(); it++) {
  42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
  43                                         abort = true;
  44                                 }
  45                         }
  46
  47                         globaldata->newRead();
  48
  49                         //if the user changes the output directory command factory will send this info to us in the output parameter
  50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
  51
  52                                 //if the user changes the input directory command factory will send this info to us in the output parameter
  53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
  54                         if (inputDir == "not found"){   inputDir = "";          }
  55                         else {
  56                                 string path;
  57                                 it = parameters.find("phylip");
  58                                 //user has given a template file
  59                                 if(it != parameters.end()){
  60                                         path = hasPath(it->second);
  61                                         //if the user has not given a path then, add inputdir. else leave path alone.
  62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
  63                                 }
  64
  65                                 it = parameters.find("column");
  66                                 //user has given a template file
  67                                 if(it != parameters.end()){
  68                                         path = hasPath(it->second);
  69                                         //if the user has not given a path then, add inputdir. else leave path alone.
  70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
  71                                 }
  72
  73                                 it = parameters.find("name");
  74                                 //user has given a template file
  75                                 if(it != parameters.end()){
  76                                         path = hasPath(it->second);
  77                                         //if the user has not given a path then, add inputdir. else leave path alone.
  78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
  79                                 }
  80
  81                                 it = parameters.find("taxonomy");
  82                                 //user has given a template file
  83                                 if(it != parameters.end()){
  84                                         path = hasPath(it->second);
  85                                         //if the user has not given a path then, add inputdir. else leave path alone.
  86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
  87                                 }
  88
  89                                 it = parameters.find("fasta");
  90                                 //user has given a template file
  91                                 if(it != parameters.end()){
  92                                         path = hasPath(it->second);
  93                                         //if the user has not given a path then, add inputdir. else leave path alone.
  94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
  95                                 }
  96                         }
  97
  98                         //check for required parameters
  99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
 100                         if (phylipfile == "not open") { abort = true; }
 101                         else if (phylipfile == "not found") { phylipfile = ""; }
 102                         else {  distfile = phylipfile;  format = "phylip";      }
 103
 104                         columnfile = validParameter.validFile(parameters, "column", true);
 105                         if (columnfile == "not open") { abort = true; }
 106                         else if (columnfile == "not found") { columnfile = ""; }
 107                         else {  distfile = columnfile; format = "column";       }
 108
 109                         namefile = validParameter.validFile(parameters, "name", true);
 110                         if (namefile == "not open") { abort = true; }
 111                         else if (namefile == "not found") { namefile = ""; }
 112
 113                         fastafile = validParameter.validFile(parameters, "fasta", true);
 114                         if (fastafile == "not open") { abort = true; }
 115                         else if (fastafile == "not found") { fastafile = ""; }
 116                         else { distfile = fastafile;  splitmethod = "fasta";  }
 117
 118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
 119                         if (taxFile == "not open") { abort = true; }
 120                         else if (taxFile == "not found") { taxFile = ""; }
 121
 122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
 123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
 124
 125                         if (columnfile != "") {
 126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
 127                         }
 128
 129                         if (fastafile != "") {
 130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 131                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
 132                         }
 133
 134                         //check for optional parameter and set defaults
 135                         // ...at some point should added some additional type checking...
 136                         //get user cutoff and precision or use defaults
 137                         string temp;
 138                         temp = validParameter.validFile(parameters, "precision", false);
 139                         if (temp == "not found") { temp = "100"; }
 140                         //saves precision legnth for formatting below
 141                         length = temp.length();
 142                         convert(temp, precision);
 143
 144                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
 145                         hard = isTrue(temp);
 146
 147                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
 148                         large = isTrue(temp);
 149
 150                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
 151                         convert(temp, processors);
 152
 153                         temp = validParameter.validFile(parameters, "splitmethod", false);
 154                         if (splitmethod != "fasta") {
 155                                 if (temp == "not found")  { splitmethod = "distance"; }
 156                                 else {  splitmethod = temp; }
 157                         }
 158
 159                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
 160                         convert(temp, cutoff);
 161                         cutoff += (5 / (precision * 10.0));
 162
 163                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
 164                         convert(temp, taxLevelCutoff);
 165
 166                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
 167
 168                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 169                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 170
 171                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
 172                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
 173
 174                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
 175
 176                         showabund = validParameter.validFile(parameters, "showabund", false);
 177                         if (showabund == "not found") { showabund = "T"; }
 178
 179                         timing = validParameter.validFile(parameters, "timing", false);
 180                         if (timing == "not found") { timing = "F"; }
 181
 182                 }
 183         }
 184         catch(exception& e) {
 185                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
 186                 exit(1);
 187         }
 188 }
 189
 190 //**********************************************************************************************************************
 191
 192 void ClusterSplitCommand::help(){
 193         try {
 194                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
 195                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
 196                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
 197                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
 198                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
 199                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
 200                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
 201                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 202                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
 203                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 204                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
 205                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
 206                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
 207                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
 208                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 209                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
 210                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 211                 m->mothurOut("The cluster.split command should be in the following format: \n");
 212                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 213                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
 214
 215         }
 216         catch(exception& e) {
 217                 m->errorOut(e, "ClusterSplitCommand", "help");
 218                 exit(1);
 219         }
 220 }
 221
 222 //**********************************************************************************************************************
 223
 224 ClusterSplitCommand::~ClusterSplitCommand(){}
 225
 226 //**********************************************************************************************************************
 227
 228 int ClusterSplitCommand::execute(){
 229         try {
 230
 231                 if (abort == true) {    return 0;       }
 232                 time_t estart;
 233                 //****************** file prep work ******************************//
 234
 235                 //if user gave a phylip file convert to column file
 236                 if (format == "phylip") {
 237                         estart = time(NULL);
 238                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
 239
 240                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
 241
 242                         NameAssignment* nameMap = NULL;
 243                         convert->setFormat("phylip");
 244                         convert->read(nameMap);
 245
 246                         if (m->control_pressed) {  delete convert;  return 0;  }
 247
 248                         distfile = convert->getOutputFile();
 249
 250                         //if no names file given with phylip file, create it
 251                         ListVector* listToMakeNameFile =  convert->getListVector();
 252                         if (namefile == "") {  //you need to make a namefile for split matrix
 253                                 ofstream out;
 254                                 namefile = phylipfile + ".names";
 255                                 openOutputFile(namefile, out);
 256                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
 257                                         string bin = listToMakeNameFile->get(i);
 258                                         out << bin << '\t' << bin << endl;
 259                                 }
 260                                 out.close();
 261                         }
 262                         delete listToMakeNameFile;
 263                         delete convert;
 264
 265                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
 266                 }
 267                 if (m->control_pressed) { return 0; }
 268
 269                 estart = time(NULL);
 270                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
 271
 272                 //split matrix into non-overlapping groups
 273                 SplitMatrix* split;
 274                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
 275                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
 276                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir);      }
 277                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
 278
 279                 split->split();
 280
 281                 if (m->control_pressed) { delete split; return 0; }
 282
 283                 string singletonName = split->getSingletonNames();
 284                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 285                 delete split;
 286
 287                 if (m->control_pressed) { return 0; }
 288
 289                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 290                 estart = time(NULL);
 291
 292                 //****************** break up files between processes and cluster each file set ******************************//
 293                 vector<string> listFileNames;
 294                 set<string> labels;
 295                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 296                                 if(processors == 1){
 297                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 298                                 }else{
 299                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
 300                                         dividedNames.resize(processors);
 301
 302                                         //for each file group figure out which process will complete it
 303                                         //want to divide the load intelligently so the big files are spread between processes
 304                                         int count = 1;
 305                                         for (int i = 0; i < distName.size(); i++) {
 306                                                 int processToAssign = (i+1) % processors;
 307                                                 if (processToAssign == 0) { processToAssign = processors; }
 308
 309                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
 310                                         }
 311
 312                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
 313                                         for (int i = 0; i < processors; i++) {
 314                                                 int remainder = ((i+1) % processors);
 315                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
 316                                         }
 317
 318                                         createProcesses(dividedNames);
 319
 320                                         if (m->control_pressed) { return 0; }
 321
 322                                         //get list of list file names from each process
 323                                         for(int i=0;i<processors;i++){
 324                                                 string filename = toString(processIDS[i]) + ".temp";
 325                                                 ifstream in;
 326                                                 openInputFile(filename, in);
 327
 328                                                 in >> tag; gobble(in);
 329
 330                                                 while(!in.eof()) {
 331                                                         string tempName;
 332                                                         in >> tempName; gobble(in);
 333                                                         listFileNames.push_back(tempName);
 334                                                 }
 335                                                 in.close();
 336                                                 remove((toString(processIDS[i]) + ".temp").c_str());
 337
 338                                                 //get labels
 339                                                 filename = toString(processIDS[i]) + ".temp.labels";
 340                                                 ifstream in2;
 341                                                 openInputFile(filename, in2);
 342
 343                                                 while(!in2.eof()) {
 344                                                         string tempName;
 345                                                         in2 >> tempName; gobble(in2);
 346                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
 347                                                 }
 348                                                 in2.close();
 349                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
 350                                         }
 351                                 }
 352                 #else
 353                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 354                 #endif
 355
 356                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 357
 358                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
 359
 360                 //****************** merge list file and create rabund and sabund files ******************************//
 361                 estart = time(NULL);
 362                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
 363
 364                 ListVector* listSingle;
 365                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 366
 367                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 368
 369                 mergeLists(listFileNames, labelBins, listSingle);
 370
 371                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 372
 373                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 374
 375                 m->mothurOutEndLine();
 376                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 377                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 378                 m->mothurOutEndLine();
 379
 380                 return 0;
 381         }
 382         catch(exception& e) {
 383                 m->errorOut(e, "ClusterSplitCommand", "execute");
 384                 exit(1);
 385         }
 386 }
 387 //**********************************************************************************************************************
 388 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 389         try {
 390
 391                 map<float, int> labelBin;
 392                 vector<float> orderFloat;
 393                 int numSingleBins;
 394
 395                 //read in singletons
 396                 if (singleton != "none") {
 397                         ifstream in;
 398                         openInputFile(singleton, in);
 399
 400                         string firstCol, secondCol;
 401                         listSingle = new ListVector();
 402                         while (!in.eof()) {
 403                                 in >> firstCol >> secondCol; gobble(in);
 404                                 listSingle->push_back(secondCol);
 405                         }
 406                         in.close();
 407                         remove(singleton.c_str());
 408
 409                         numSingleBins = listSingle->getNumBins();
 410                 }else{  listSingle = NULL; numSingleBins = 0;  }
 411
 412                 //go through users set and make them floats so we can sort them
 413                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
 414                         float temp = -10.0;
 415
 416                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
 417                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
 418
 419                         orderFloat.push_back(temp);
 420                         labelBin[temp] = numSingleBins; //initialize numbins
 421                 }
 422
 423                 //sort order
 424                 sort(orderFloat.begin(), orderFloat.end());
 425                 userLabels.clear();
 426
 427                 //get the list info from each file
 428                 for (int k = 0; k < listNames.size(); k++) {
 429
 430                         if (m->control_pressed) {
 431                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
 432                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
 433                                 return labelBin;
 434                         }
 435
 436                         InputData* input = new InputData(listNames[k], "list");
 437                         ListVector* list = input->getListVector();
 438                         string lastLabel = list->getLabel();
 439
 440                         string filledInList = listNames[k] + "filledInTemp";
 441                         ofstream outFilled;
 442                         openOutputFile(filledInList, outFilled);
 443
 444                         //for each label needed
 445                         for(int l = 0; l < orderFloat.size(); l++){
 446
 447                                 string thisLabel;
 448                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
 449                                 else { thisLabel = toString(orderFloat[l],  length-1);  }
 450
 451                                 //this file has reached the end
 452                                 if (list == NULL) {
 453                                         list = input->getListVector(lastLabel, true);
 454                                 }else{  //do you have the distance, or do you need to fill in
 455
 456                                         float labelFloat;
 457                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
 458                                         else { convert(list->getLabel(), labelFloat); }
 459
 460                                         //check for missing labels
 461                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
 462                                                 //if its bigger get last label, otherwise keep it
 463                                                 delete list;
 464                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
 465                                         }
 466                                         lastLabel = list->getLabel();
 467                                 }
 468
 469                                 //print to new file
 470                                 list->setLabel(thisLabel);
 471                                 list->print(outFilled);
 472
 473                                 //update labelBin
 474                                 labelBin[orderFloat[l]] += list->getNumBins();
 475
 476                                 delete list;
 477
 478                                 list = input->getListVector();
 479                         }
 480
 481                         if (list != NULL) { delete list; }
 482                         delete input;
 483
 484                         outFilled.close();
 485                         remove(listNames[k].c_str());
 486                         rename(filledInList.c_str(), listNames[k].c_str());
 487                 }
 488
 489                 return labelBin;
 490         }
 491         catch(exception& e) {
 492                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
 493                 exit(1);
 494         }
 495 }
 496 //**********************************************************************************************************************
 497 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
 498         try {
 499                 if (outputDir == "") { outputDir += hasPath(distfile); }
 500                 fileroot = outputDir + getRootName(getSimpleName(distfile));
 501
 502                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
 503                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
 504                 openOutputFile(fileroot+ tag + ".list",         outList);
 505
 506                 outputNames.push_back(fileroot+ tag + ".sabund");
 507                 outputNames.push_back(fileroot+ tag + ".rabund");
 508                 outputNames.push_back(fileroot+ tag + ".list");
 509
 510                 map<float, int>::iterator itLabel;
 511
 512                 //for each label needed
 513                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
 514
 515                         string thisLabel;
 516                         if (itLabel->first == -1) { thisLabel = "unique"; }
 517                         else { thisLabel = toString(itLabel->first,  length-1);  }
 518
 519                         outList << thisLabel << '\t' << itLabel->second << '\t';
 520
 521                         RAbundVector* rabund = new RAbundVector();
 522                         rabund->setLabel(thisLabel);
 523
 524                         //add in singletons
 525                         if (listSingle != NULL) {
 526                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
 527                                         outList << listSingle->get(j) << '\t';
 528                                         rabund->push_back(getNumNames(listSingle->get(j)));
 529                                 }
 530                         }
 531
 532                         //get the list info from each file
 533                         for (int k = 0; k < listNames.size(); k++) {
 534
 535                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
 536
 537                                 InputData* input = new InputData(listNames[k], "list");
 538                                 ListVector* list = input->getListVector(thisLabel);
 539
 540                                 //this file has reached the end
 541                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
 542                                 else {
 543                                         for (int j = 0; j < list->getNumBins(); j++) {
 544                                                 outList << list->get(j) << '\t';
 545                                                 rabund->push_back(getNumNames(list->get(j)));
 546                                         }
 547                                         delete list;
 548                                 }
 549                                 delete input;
 550                         }
 551
 552                         SAbundVector sabund = rabund->getSAbundVector();
 553
 554                         sabund.print(outSabund);
 555                         rabund->print(outRabund);
 556                         outList << endl;
 557
 558                         delete rabund;
 559                 }
 560
 561                 outList.close();
 562                 outRabund.close();
 563                 outSabund.close();
 564
 565                 if (listSingle != NULL) { delete listSingle;  }
 566
 567                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
 568
 569                 return 0;
 570         }
 571         catch(exception& e) {
 572                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
 573                 exit(1);
 574         }
 575 }
 576
 577 //**********************************************************************************************************************
 578
 579 void ClusterSplitCommand::printData(ListVector* oldList){
 580         try {
 581                 string label = oldList->getLabel();
 582                 RAbundVector oldRAbund = oldList->getRAbundVector();
 583
 584                 oldRAbund.setLabel(label);
 585                 if (isTrue(showabund)) {
 586                         oldRAbund.getSAbundVector().print(cout);
 587                 }
 588                 oldRAbund.print(outRabund);
 589                 oldRAbund.getSAbundVector().print(outSabund);
 590
 591                 oldList->print(outList);
 592         }
 593         catch(exception& e) {
 594                 m->errorOut(e, "ClusterSplitCommand", "printData");
 595                 exit(1);
 596         }
 597 }
 598 //**********************************************************************************************************************
 599 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
 600         try {
 601
 602         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
 603                 int process = 0;
 604                 int exitCommand = 1;
 605                 processIDS.clear();
 606
 607                 //loop through and create all the processes you want
 608                 while (process != processors) {
 609                         int pid = fork();
 610
 611                         if (pid > 0) {
 612                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 613                                 process++;
 614                         }else if (pid == 0){
 615                                 set<string> labels;
 616                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
 617
 618                                 //write out names to file
 619                                 string filename = toString(getpid()) + ".temp";
 620                                 ofstream out;
 621                                 openOutputFile(filename, out);
 622                                 out << tag << endl;
 623                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 624                                 out.close();
 625
 626                                 //print out labels
 627                                 ofstream outLabels;
 628                                 filename = toString(getpid()) + ".temp.labels";
 629                                 openOutputFile(filename, outLabels);
 630
 631                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
 632                                         outLabels << (*it) << endl;
 633                                 }
 634                                 outLabels.close();
 635
 636                                 exit(0);
 637                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
 638                 }
 639
 640                 //force parent to wait until all the processes are done
 641                 for (int i=0;i<processors;i++) {
 642                         int temp = processIDS[i];
 643                         wait(&temp);
 644                 }
 645
 646                 return exitCommand;
 647         #endif
 648
 649         }
 650         catch(exception& e) {
 651                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
 652                 exit(1);
 653         }
 654 }
 655 //**********************************************************************************************************************
 656
 657 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 658         try {
 659                 Cluster* cluster;
 660                 SparseMatrix* matrix;
 661                 ListVector* list;
 662                 ListVector oldList;
 663                 RAbundVector* rabund;
 664
 665                 vector<string> listFileNames;
 666
 667                 //cluster each distance file
 668                 for (int i = 0; i < distNames.size(); i++) {
 669
 670                         string thisNamefile = distNames[i].begin()->second;
 671                         string thisDistFile = distNames[i].begin()->first;
 672
 673                         //read in distance file
 674                         globaldata->setNameFile(thisNamefile);
 675                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
 676
 677                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
 678
 679                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
 680                         read->setCutoff(cutoff);
 681
 682                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
 683                         nameMap->readMap();
 684                         read->read(nameMap);
 685
 686                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
 687
 688                         list = read->getListVector();
 689                         oldList = *list;
 690                         matrix = read->getMatrix();
 691
 692                         delete read;
 693                         delete nameMap;
 694
 695                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 696
 697                         rabund = new RAbundVector(list->getRAbundVector());
 698
 699                         //create cluster
 700                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
 701                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
 702                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
 703                         tag = cluster->getTag();
 704
 705                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
 706                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
 707
 708                         ofstream listFile;
 709                         openOutputFile(fileroot+ tag + ".list", listFile);
 710
 711                         listFileNames.push_back(fileroot+ tag + ".list");
 712
 713                         time_t estart = time(NULL);
 714
 715                         float previousDist = 0.00000;
 716                         float rndPreviousDist = 0.00000;
 717
 718                         oldList = *list;
 719
 720                         print_start = true;
 721                         start = time(NULL);
 722                         double saveCutoff = cutoff;
 723
 724                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
 725
 726                                 if (m->control_pressed) { //clean up
 727                                         delete matrix; delete list;     delete cluster; delete rabund;
 728                                         listFile.close();
 729                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 730                                         listFileNames.clear(); return listFileNames;
 731                                 }
 732
 733                                 cluster->update(cutoff);
 734
 735                                 float dist = matrix->getSmallDist();
 736                                 float rndDist;
 737                                 if (hard) {
 738                                         rndDist = ceilDist(dist, precision);
 739                                 }else{
 740                                         rndDist = roundDist(dist, precision);
 741                                 }
 742
 743                                 if(previousDist <= 0.0000 && dist != previousDist){
 744                                         oldList.setLabel("unique");
 745                                         oldList.print(listFile);
 746                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
 747                                 }
 748                                 else if(rndDist != rndPreviousDist){
 749                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
 750                                         oldList.print(listFile);
 751                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 752                                 }
 753
 754                                 previousDist = dist;
 755                                 rndPreviousDist = rndDist;
 756                                 oldList = *list;
 757                         }
 758
 759
 760                         if(previousDist <= 0.0000){
 761                                 oldList.setLabel("unique");
 762                                 oldList.print(listFile);
 763                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
 764                         }
 765                         else if(rndPreviousDist<cutoff){
 766                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
 767                                 oldList.print(listFile);
 768                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 769                         }
 770
 771                         delete matrix; delete list;     delete cluster; delete rabund;
 772                         listFile.close();
 773
 774                         if (m->control_pressed) { //clean up
 775                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
 776                                 listFileNames.clear(); return listFileNames;
 777                         }
 778
 779                         remove(thisDistFile.c_str());
 780                         remove(thisNamefile.c_str());
 781                 }
 782
 783
 784                 return listFileNames;
 785
 786         }
 787         catch(exception& e) {
 788                 m->errorOut(e, "ClusterSplitCommand", "cluster");
 789                 exit(1);
 790         }
 791
 792
 793 }
 794
 795 //**********************************************************************************************************************