]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
added mpi code to cluster.split command
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
21         try{
22                 globaldata = GlobalData::getInstance();
23                 abort = false;
24                 format = "";
25                 
26                 //allow user to run help
27                 if(option == "help") { help(); abort = true; }
28                 
29                 else {
30                         //valid paramters for this command
31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33                         
34                         OptionParser parser(option);
35                         map<string,string> parameters = parser.getParameters();
36                         
37                         ValidParameters validParameter;
38                 
39                         //check to make sure all parameters are valid for command
40                         map<string,string>::iterator it;
41                         for (it = parameters.begin(); it != parameters.end(); it++) { 
42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
43                                         abort = true;
44                                 }
45                         }
46                         
47                         globaldata->newRead();
48                         
49                         //if the user changes the output directory command factory will send this info to us in the output parameter 
50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
51                         
52                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
54                         if (inputDir == "not found"){   inputDir = "";          }
55                         else {
56                                 string path;
57                                 it = parameters.find("phylip");
58                                 //user has given a template file
59                                 if(it != parameters.end()){ 
60                                         path = hasPath(it->second);
61                                         //if the user has not given a path then, add inputdir. else leave path alone.
62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
63                                 }
64                                 
65                                 it = parameters.find("column");
66                                 //user has given a template file
67                                 if(it != parameters.end()){ 
68                                         path = hasPath(it->second);
69                                         //if the user has not given a path then, add inputdir. else leave path alone.
70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
71                                 }
72                                 
73                                 it = parameters.find("name");
74                                 //user has given a template file
75                                 if(it != parameters.end()){ 
76                                         path = hasPath(it->second);
77                                         //if the user has not given a path then, add inputdir. else leave path alone.
78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
79                                 }
80                                 
81                                 it = parameters.find("taxonomy");
82                                 //user has given a template file
83                                 if(it != parameters.end()){ 
84                                         path = hasPath(it->second);
85                                         //if the user has not given a path then, add inputdir. else leave path alone.
86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
87                                 }
88                                 
89                                 it = parameters.find("fasta");
90                                 //user has given a template file
91                                 if(it != parameters.end()){ 
92                                         path = hasPath(it->second);
93                                         //if the user has not given a path then, add inputdir. else leave path alone.
94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
95                                 }
96                         }
97                         
98                         //check for required parameters
99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
100                         if (phylipfile == "not open") { abort = true; }
101                         else if (phylipfile == "not found") { phylipfile = ""; }        
102                         else {  distfile = phylipfile;  format = "phylip";      }
103                         
104                         columnfile = validParameter.validFile(parameters, "column", true);
105                         if (columnfile == "not open") { abort = true; } 
106                         else if (columnfile == "not found") { columnfile = ""; }
107                         else {  distfile = columnfile; format = "column";       }
108                         
109                         namefile = validParameter.validFile(parameters, "name", true);
110                         if (namefile == "not open") { abort = true; }   
111                         else if (namefile == "not found") { namefile = ""; }
112                         
113                         fastafile = validParameter.validFile(parameters, "fasta", true);
114                         if (fastafile == "not open") { abort = true; }  
115                         else if (fastafile == "not found") { fastafile = ""; }
116                         else { distfile = fastafile;  splitmethod = "fasta";  }
117                         
118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
119                         if (taxFile == "not open") { abort = true; }    
120                         else if (taxFile == "not found") { taxFile = ""; }
121                         
122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
124                 
125                         if (columnfile != "") {
126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
127                         }
128                         
129                         if (fastafile != "") {
130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
131                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
132                         }
133                                         
134                         //check for optional parameter and set defaults
135                         // ...at some point should added some additional type checking...
136                         //get user cutoff and precision or use defaults
137                         string temp;
138                         temp = validParameter.validFile(parameters, "precision", false);
139                         if (temp == "not found") { temp = "100"; }
140                         //saves precision legnth for formatting below
141                         length = temp.length();
142                         convert(temp, precision); 
143                         
144                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
145                         hard = isTrue(temp);
146                         
147                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
148                         large = isTrue(temp);
149                         
150                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
151                         convert(temp, processors); 
152                         
153                         temp = validParameter.validFile(parameters, "splitmethod", false);      
154                         if (splitmethod != "fasta") {
155                                 if (temp == "not found")  { splitmethod = "distance"; }
156                                 else {  splitmethod = temp; }
157                         }
158                         
159                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
160                         convert(temp, cutoff); 
161                         cutoff += (5 / (precision * 10.0));  
162                         
163                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
164                         convert(temp, taxLevelCutoff); 
165                         
166                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
167                         
168                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
169                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
170                         
171                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
172                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
173                         
174                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
175
176                         showabund = validParameter.validFile(parameters, "showabund", false);
177                         if (showabund == "not found") { showabund = "T"; }
178
179                         timing = validParameter.validFile(parameters, "timing", false);
180                         if (timing == "not found") { timing = "F"; }
181                         
182                 }
183         }
184         catch(exception& e) {
185                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
186                 exit(1);
187         }
188 }
189
190 //**********************************************************************************************************************
191
192 void ClusterSplitCommand::help(){
193         try {
194                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
195                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
196                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
197                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
198                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
199                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
200                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
201                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
202                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
203                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
204                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
205                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
206                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
207                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
208                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
209                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
210                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
211                 #ifdef USE_MPI
212                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
213                 #endif
214                 m->mothurOut("The cluster.split command should be in the following format: \n");
215                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
216                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
217
218         }
219         catch(exception& e) {
220                 m->errorOut(e, "ClusterSplitCommand", "help");
221                 exit(1);
222         }
223 }
224
225 //**********************************************************************************************************************
226
227 ClusterSplitCommand::~ClusterSplitCommand(){}
228
229 //**********************************************************************************************************************
230
231 int ClusterSplitCommand::execute(){
232         try {
233         
234                 if (abort == true) {    return 0;       }
235                 
236                 time_t estart;
237                 vector<string> listFileNames;
238                 set<string> labels;
239                 string singletonName = "";
240
241                 //****************** file prep work ******************************//
242                 #ifdef USE_MPI
243                         int pid;
244                         int tag = 2001;
245                         MPI_Status status; 
246                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
247                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
248                         
249                         if (pid == 0) { //only process 0 converts and splits
250                         
251                 #endif
252                 
253                 //if user gave a phylip file convert to column file
254                 if (format == "phylip") {
255                         estart = time(NULL);
256                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
257                         
258                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
259                         
260                         NameAssignment* nameMap = NULL;
261                         convert->setFormat("phylip");
262                         convert->read(nameMap);
263                         
264                         if (m->control_pressed) {  delete convert;  return 0;  }
265                         
266                         distfile = convert->getOutputFile();
267                 
268                         //if no names file given with phylip file, create it
269                         ListVector* listToMakeNameFile =  convert->getListVector();
270                         if (namefile == "") {  //you need to make a namefile for split matrix
271                                 ofstream out;
272                                 namefile = phylipfile + ".names";
273                                 openOutputFile(namefile, out);
274                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
275                                         string bin = listToMakeNameFile->get(i);
276                                         out << bin << '\t' << bin << endl;
277                                 }
278                                 out.close();
279                         }
280                         delete listToMakeNameFile;
281                         delete convert;
282                         
283                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
284                 }
285                 if (m->control_pressed) { return 0; }
286                 
287                 estart = time(NULL);
288                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
289                 
290                 //split matrix into non-overlapping groups
291                 SplitMatrix* split;
292                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
293                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
294                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir);      }
295                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
296                 
297                 split->split();
298                 
299                 if (m->control_pressed) { delete split; return 0; }
300                 
301                 singletonName = split->getSingletonNames();
302                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
303                 delete split;
304                 
305                 if (m->control_pressed) { return 0; }
306                 
307                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
308                 estart = time(NULL);
309                 
310                 //****************** break up files between processes and cluster each file set ******************************//
311         #ifdef USE_MPI
312                         ////you are process 0 from above////
313                         
314                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
315                         dividedNames.resize(processors);
316                                         
317                         //for each file group figure out which process will complete it
318                         //want to divide the load intelligently so the big files are spread between processes
319                         int count = 1;
320                         for (int i = 0; i < distName.size(); i++) { 
321                                 int processToAssign = (i+1) % processors; 
322                                 if (processToAssign == 0) { processToAssign = processors; }
323                                                 
324                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
325                         }
326                                         
327                         //not lets reverse the order of ever other process, so we balance big files running with little ones
328                         for (int i = 0; i < processors; i++) {
329                                 int remainder = ((i+1) % processors);
330                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
331                         }
332                         
333                         
334                         //send each child the list of files it needs to process
335                         for(int i = 1; i < processors; i++) { 
336                                 //send number of file pairs
337                                 int num = dividedNames[i].size();
338                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
339                                 
340                                 for (int j = 0; j < num; j++) { //send filenames to process i
341                                         char tempDistFileName[1024];
342                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
343                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
344                                         
345                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
346                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
347                                         
348                                         char tempNameFileName[1024];
349                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
350                                         int lengthName = (dividedNames[i][j].begin()->second).length();
351
352                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
353                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
354                                 }
355                         }
356                         
357                         //process your share
358                         listFileNames = cluster(dividedNames[0], labels);
359                         
360                         //receive the other processes info
361                         for(int i = 1; i < processors; i++) { 
362                                 int num = dividedNames[i].size();
363                                 
364                                 //send list filenames to root process
365                                 for (int j = 0; j < num; j++) {  
366                                         int lengthList = 0;
367                                         char tempListFileName[1024];
368                                 
369                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
370                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
371                                 
372                                         string myListFileName = tempListFileName;
373                                         myListFileName = myListFileName.substr(0, lengthList);
374                                         
375                                         listFileNames.push_back(myListFileName);
376                                 }
377                                 
378                                 //send Labels to root process
379                                 int numLabels = 0;
380                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
381                                 
382                                 for (int j = 0; j < numLabels; j++) {  
383                                         int lengthLabel = 0;
384                                         char tempLabel[100];
385                                 
386                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
387                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
388                                 
389                                         string myLabel = tempLabel;
390                                         myLabel = myLabel.substr(0, lengthLabel);
391                         
392                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
393                                 }
394                         }
395                         
396                 }else { //you are a child process
397                         vector < map<string, string> >  myNames;
398                         
399                         //recieve the files you need to process
400                         //receive number of file pairs
401                         int num = 0;
402                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
403                         
404                         myNames.resize(num);
405         
406                         for (int j = 0; j < num; j++) { //receive filenames to process 
407                                 int lengthDist = 0;
408                                 char tempDistFileName[1024];
409                                 
410                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
411                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
412                                 
413                                 string myDistFileName = tempDistFileName;
414                                 myDistFileName = myDistFileName.substr(0, lengthDist);
415                         
416                                 int lengthName = 0;
417                                 char tempNameFileName[1024];
418                                 
419                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
420                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
421                                 
422                                 string myNameFileName = tempNameFileName;
423                                 myNameFileName = myNameFileName.substr(0, lengthName);
424                                 
425                                 //save file name
426                                 myNames[j][myDistFileName] = myNameFileName;
427                         }
428         
429                         //process them
430                         listFileNames = cluster(myNames, labels);
431                         
432                         //send list filenames to root process
433                         for (int j = 0; j < num; j++) {  
434                                 char tempListFileName[1024];
435                                 strcpy(tempListFileName, listFileNames[j].c_str());
436                                 int lengthList = listFileNames[j].length();
437                                         
438                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
439                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
440                         }
441                         
442                         //send Labels to root process
443                         int numLabels = labels.size();
444                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
445                         
446                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
447                                 char tempLabel[100];
448                                 strcpy(tempLabel, (*it).c_str());
449                                 int lengthLabel = (*it).length();
450                                         
451                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
452                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
453                         }
454                 }
455                 
456                 //make everyone wait
457                 MPI_Barrier(MPI_COMM_WORLD);
458                 
459         #else
460
461                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
462                                 if(processors == 1){
463                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
464                                 }else{
465                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
466                                         dividedNames.resize(processors);
467                                         
468                                         //for each file group figure out which process will complete it
469                                         //want to divide the load intelligently so the big files are spread between processes
470                                         int count = 1;
471                                         for (int i = 0; i < distName.size(); i++) { 
472                                                 int processToAssign = (i+1) % processors; 
473                                                 if (processToAssign == 0) { processToAssign = processors; }
474                                                 
475                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
476                                         }
477                                         
478                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
479                                         for (int i = 0; i < processors; i++) {
480                                                 int remainder = ((i+1) % processors);
481                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
482                                         }
483                                         
484                                         createProcesses(dividedNames);
485                                                         
486                                         if (m->control_pressed) { return 0; }
487
488                                         //get list of list file names from each process
489                                         for(int i=0;i<processors;i++){
490                                                 string filename = toString(processIDS[i]) + ".temp";
491                                                 ifstream in;
492                                                 openInputFile(filename, in);
493                                                 
494                                                 in >> tag; gobble(in);
495                                                 
496                                                 while(!in.eof()) {
497                                                         string tempName;
498                                                         in >> tempName; gobble(in);
499                                                         listFileNames.push_back(tempName);
500                                                 }
501                                                 in.close();
502                                                 remove((toString(processIDS[i]) + ".temp").c_str());
503                                                 
504                                                 //get labels
505                                                 filename = toString(processIDS[i]) + ".temp.labels";
506                                                 ifstream in2;
507                                                 openInputFile(filename, in2);
508                                                 
509                                                 while(!in2.eof()) {
510                                                         string tempName;
511                                                         in2 >> tempName; gobble(in2);
512                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
513                                                 }
514                                                 in2.close();
515                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
516                                         }
517                                 }
518                 #else
519                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
520                 #endif
521         #endif  
522                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
523                 
524                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
525                 
526                 //****************** merge list file and create rabund and sabund files ******************************//
527                 estart = time(NULL);
528                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
529                 
530                 #ifdef USE_MPI
531                         if (pid == 0) { //only process 0 merges
532                 #endif
533
534                 ListVector* listSingle;
535                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
536                 
537                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
538                 
539                 mergeLists(listFileNames, labelBins, listSingle);
540
541                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
542                 
543                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
544                 
545                 m->mothurOutEndLine();
546                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
547                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
548                 m->mothurOutEndLine();
549                 
550                 #ifdef USE_MPI
551                         } //only process 0 merges
552                         
553                         //make everyone wait
554                         MPI_Barrier(MPI_COMM_WORLD);
555                 #endif
556
557                 return 0;
558         }
559         catch(exception& e) {
560                 m->errorOut(e, "ClusterSplitCommand", "execute");
561                 exit(1);
562         }
563 }
564 //**********************************************************************************************************************
565 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
566         try {
567                                 
568                 map<float, int> labelBin;
569                 vector<float> orderFloat;
570                 int numSingleBins;
571                 
572                 //read in singletons
573                 if (singleton != "none") {
574                         ifstream in;
575                         openInputFile(singleton, in);
576                                 
577                         string firstCol, secondCol;
578                         listSingle = new ListVector();
579                         while (!in.eof()) {
580                                 in >> firstCol >> secondCol; gobble(in);
581                                 listSingle->push_back(secondCol);
582                         }
583                         in.close();
584                         remove(singleton.c_str());
585                         
586                         numSingleBins = listSingle->getNumBins();
587                 }else{  listSingle = NULL; numSingleBins = 0;  }
588                 
589                 //go through users set and make them floats so we can sort them 
590                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
591                         float temp = -10.0;
592
593                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
594                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
595                                                 
596                         orderFloat.push_back(temp);
597                         labelBin[temp] = numSingleBins; //initialize numbins 
598                 }
599         
600                 //sort order
601                 sort(orderFloat.begin(), orderFloat.end());
602                 userLabels.clear();
603                         
604                 //get the list info from each file
605                 for (int k = 0; k < listNames.size(); k++) {
606         
607                         if (m->control_pressed) {  
608                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
609                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
610                                 return labelBin;
611                         }
612                         
613                         InputData* input = new InputData(listNames[k], "list");
614                         ListVector* list = input->getListVector();
615                         string lastLabel = list->getLabel();
616                         
617                         string filledInList = listNames[k] + "filledInTemp";
618                         ofstream outFilled;
619                         openOutputFile(filledInList, outFilled);
620         
621                         //for each label needed
622                         for(int l = 0; l < orderFloat.size(); l++){
623                         
624                                 string thisLabel;
625                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
626                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
627
628                                 //this file has reached the end
629                                 if (list == NULL) { 
630                                         list = input->getListVector(lastLabel, true); 
631                                 }else{  //do you have the distance, or do you need to fill in
632                                                 
633                                         float labelFloat;
634                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
635                                         else { convert(list->getLabel(), labelFloat); }
636
637                                         //check for missing labels
638                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
639                                                 //if its bigger get last label, otherwise keep it
640                                                 delete list;
641                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
642                                         }
643                                         lastLabel = list->getLabel();
644                                 }
645                                 
646                                 //print to new file
647                                 list->setLabel(thisLabel);
648                                 list->print(outFilled);
649                 
650                                 //update labelBin
651                                 labelBin[orderFloat[l]] += list->getNumBins();
652                                                                         
653                                 delete list;
654                                                                         
655                                 list = input->getListVector();
656                         }
657                         
658                         if (list != NULL) { delete list; }
659                         delete input;
660                         
661                         outFilled.close();
662                         remove(listNames[k].c_str());
663                         rename(filledInList.c_str(), listNames[k].c_str());
664                 }
665                 
666                 return labelBin;
667         }
668         catch(exception& e) {
669                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
670                 exit(1);
671         }
672 }
673 //**********************************************************************************************************************
674 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
675         try {
676                 if (outputDir == "") { outputDir += hasPath(distfile); }
677                 fileroot = outputDir + getRootName(getSimpleName(distfile));
678                 
679                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
680                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
681                 openOutputFile(fileroot+ tag + ".list",         outList);
682                                 
683                 outputNames.push_back(fileroot+ tag + ".sabund");
684                 outputNames.push_back(fileroot+ tag + ".rabund");
685                 outputNames.push_back(fileroot+ tag + ".list");
686                 
687                 map<float, int>::iterator itLabel;
688
689                 //for each label needed
690                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
691                         
692                         string thisLabel;
693                         if (itLabel->first == -1) { thisLabel = "unique"; }
694                         else { thisLabel = toString(itLabel->first,  length-1);  } 
695                         
696                         outList << thisLabel << '\t' << itLabel->second << '\t';
697
698                         RAbundVector* rabund = new RAbundVector();
699                         rabund->setLabel(thisLabel);
700
701                         //add in singletons
702                         if (listSingle != NULL) {
703                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
704                                         outList << listSingle->get(j) << '\t';
705                                         rabund->push_back(getNumNames(listSingle->get(j)));
706                                 }
707                         }
708                         
709                         //get the list info from each file
710                         for (int k = 0; k < listNames.size(); k++) {
711         
712                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
713                                 
714                                 InputData* input = new InputData(listNames[k], "list");
715                                 ListVector* list = input->getListVector(thisLabel);
716                                 
717                                 //this file has reached the end
718                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
719                                 else {          
720                                         for (int j = 0; j < list->getNumBins(); j++) {
721                                                 outList << list->get(j) << '\t';
722                                                 rabund->push_back(getNumNames(list->get(j)));
723                                         }
724                                         delete list;
725                                 }
726                                 delete input;
727                         }
728                         
729                         SAbundVector sabund = rabund->getSAbundVector();
730                         
731                         sabund.print(outSabund);
732                         rabund->print(outRabund);
733                         outList << endl;
734                         
735                         delete rabund;
736                 }
737                 
738                 outList.close();
739                 outRabund.close();
740                 outSabund.close();
741                 
742                 if (listSingle != NULL) { delete listSingle;  }
743                 
744                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
745                 
746                 return 0;
747         }
748         catch(exception& e) {
749                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
750                 exit(1);
751         }
752 }
753
754 //**********************************************************************************************************************
755
756 void ClusterSplitCommand::printData(ListVector* oldList){
757         try {
758                 string label = oldList->getLabel();
759                 RAbundVector oldRAbund = oldList->getRAbundVector();
760                 
761                 oldRAbund.setLabel(label);
762                 if (isTrue(showabund)) {
763                         oldRAbund.getSAbundVector().print(cout);
764                 }
765                 oldRAbund.print(outRabund);
766                 oldRAbund.getSAbundVector().print(outSabund);
767         
768                 oldList->print(outList);
769         }
770         catch(exception& e) {
771                 m->errorOut(e, "ClusterSplitCommand", "printData");
772                 exit(1);
773         }
774 }
775 //**********************************************************************************************************************
776 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
777         try {
778         
779         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
780                 int process = 0;
781                 int exitCommand = 1;
782                 processIDS.clear();
783                 
784                 //loop through and create all the processes you want
785                 while (process != processors) {
786                         int pid = fork();
787                         
788                         if (pid > 0) {
789                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
790                                 process++;
791                         }else if (pid == 0){
792                                 set<string> labels;
793                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
794                                 
795                                 //write out names to file
796                                 string filename = toString(getpid()) + ".temp";
797                                 ofstream out;
798                                 openOutputFile(filename, out);
799                                 out << tag << endl;
800                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
801                                 out.close();
802                                 
803                                 //print out labels
804                                 ofstream outLabels;
805                                 filename = toString(getpid()) + ".temp.labels";
806                                 openOutputFile(filename, outLabels);
807                 
808                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
809                                         outLabels << (*it) << endl;
810                                 }
811                                 outLabels.close();
812
813                                 exit(0);
814                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
815                 }
816                 
817                 //force parent to wait until all the processes are done
818                 for (int i=0;i<processors;i++) { 
819                         int temp = processIDS[i];
820                         wait(&temp);
821                 }
822                 
823                 return exitCommand;
824         #endif          
825         
826         }
827         catch(exception& e) {
828                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
829                 exit(1);
830         }
831 }
832 //**********************************************************************************************************************
833
834 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
835         try {
836                 Cluster* cluster;
837                 SparseMatrix* matrix;
838                 ListVector* list;
839                 ListVector oldList;
840                 RAbundVector* rabund;
841                 
842                 vector<string> listFileNames;
843                 
844                 //cluster each distance file
845                 for (int i = 0; i < distNames.size(); i++) {
846                         
847                         string thisNamefile = distNames[i].begin()->second;
848                         string thisDistFile = distNames[i].begin()->first;
849                         
850                         //read in distance file
851                         globaldata->setNameFile(thisNamefile);
852                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
853                         
854                         #ifdef USE_MPI
855                                 int pid;
856                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
857                                 
858                                 //output your files too
859                                 if (pid != 0) {
860                                         cout << endl << "Reading " << thisDistFile << endl;
861                                 }
862                         #endif
863                         
864                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
865                         
866                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
867                         read->setCutoff(cutoff);
868
869                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
870                         nameMap->readMap();
871                         read->read(nameMap);
872                         
873                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
874                         
875                         list = read->getListVector();
876                         oldList = *list;
877                         matrix = read->getMatrix();
878                         
879                         delete read; 
880                         delete nameMap; 
881                         
882                         
883                         #ifdef USE_MPI
884                                 //output your files too
885                                 if (pid != 0) {
886                                         cout << endl << "Clustering " << thisDistFile << endl;
887                                 }
888                         #endif
889                         
890                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
891                 
892                         rabund = new RAbundVector(list->getRAbundVector());
893                         
894                         //create cluster
895                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
896                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
897                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
898                         tag = cluster->getTag();
899                 
900                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
901                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
902                         
903                         ofstream listFile;
904                         openOutputFile(fileroot+ tag + ".list", listFile);
905                 
906                         listFileNames.push_back(fileroot+ tag + ".list");
907                 
908                         time_t estart = time(NULL);
909                         
910                         float previousDist = 0.00000;
911                         float rndPreviousDist = 0.00000;
912                         
913                         oldList = *list;
914
915                         print_start = true;
916                         start = time(NULL);
917                         double saveCutoff = cutoff;
918                 
919                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
920                 
921                                 if (m->control_pressed) { //clean up
922                                         delete matrix; delete list;     delete cluster; delete rabund;
923                                         listFile.close();
924                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
925                                         listFileNames.clear(); return listFileNames;
926                                 }
927                 
928                                 cluster->update(cutoff);
929         
930                                 float dist = matrix->getSmallDist();
931                                 float rndDist;
932                                 if (hard) {
933                                         rndDist = ceilDist(dist, precision); 
934                                 }else{
935                                         rndDist = roundDist(dist, precision); 
936                                 }
937
938                                 if(previousDist <= 0.0000 && dist != previousDist){
939                                         oldList.setLabel("unique");
940                                         oldList.print(listFile);
941                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
942                                 }
943                                 else if(rndDist != rndPreviousDist){
944                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
945                                         oldList.print(listFile);
946                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
947                                 }
948                 
949                                 previousDist = dist;
950                                 rndPreviousDist = rndDist;
951                                 oldList = *list;
952                         }
953
954                 
955                         if(previousDist <= 0.0000){
956                                 oldList.setLabel("unique");
957                                 oldList.print(listFile);
958                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
959                         }
960                         else if(rndPreviousDist<cutoff){
961                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
962                                 oldList.print(listFile);
963                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
964                         }
965         
966                         delete matrix; delete list;     delete cluster; delete rabund; 
967                         listFile.close();
968                         
969                         if (m->control_pressed) { //clean up
970                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
971                                 listFileNames.clear(); return listFileNames;
972                         }
973                         
974                         remove(thisDistFile.c_str());
975                         remove(thisNamefile.c_str());
976                 }
977                 
978                                         
979                 return listFileNames;
980         
981         }
982         catch(exception& e) {
983                 m->errorOut(e, "ClusterSplitCommand", "cluster");
984                 exit(1);
985         }
986
987
988 }
989
990 //**********************************************************************************************************************