]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
fixes while testing
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){       
21         try {
22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
24                 return myArray;
25         }
26         catch(exception& e) {
27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
28                 exit(1);
29         }
30 }
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){     
33         try {
34                 abort = true;
35                 //initialize outputTypes
36                 vector<string> tempOutNames;
37                 outputTypes["list"] = tempOutNames;
38                 outputTypes["rabund"] = tempOutNames;
39                 outputTypes["sabund"] = tempOutNames;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
43                 exit(1);
44         }
45 }
46 //**********************************************************************************************************************
47 vector<string> ClusterSplitCommand::getRequiredParameters(){    
48         try {
49                 string Array[] =  {"fasta","phylip","column","or"};
50                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
51                 return myArray;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
55                 exit(1);
56         }
57 }
58 //**********************************************************************************************************************
59 vector<string> ClusterSplitCommand::getRequiredFiles(){ 
60         try {
61                 vector<string> myArray;
62                 return myArray;
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
66                 exit(1);
67         }
68 }
69 //**********************************************************************************************************************
70 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
71 ClusterSplitCommand::ClusterSplitCommand(string option)  {
72         try{
73                 globaldata = GlobalData::getInstance();
74                 abort = false;
75                 format = "";
76                 
77                 //allow user to run help
78                 if(option == "help") { help(); abort = true; }
79                 
80                 else {
81                         //valid paramters for this command
82                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
83                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
84                         
85                         OptionParser parser(option);
86                         map<string,string> parameters = parser.getParameters();
87                         
88                         ValidParameters validParameter("cluster.split");
89                 
90                         //check to make sure all parameters are valid for command
91                         map<string,string>::iterator it;
92                         for (it = parameters.begin(); it != parameters.end(); it++) { 
93                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
94                                         abort = true;
95                                 }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100                         outputTypes["list"] = tempOutNames;
101                         outputTypes["rabund"] = tempOutNames;
102                         outputTypes["sabund"] = tempOutNames;
103                         
104                         globaldata->newRead();
105                         
106                         //if the user changes the output directory command factory will send this info to us in the output parameter 
107                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
108                         
109                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("phylip");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
120                                 }
121                                 
122                                 it = parameters.find("column");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
128                                 }
129                                 
130                                 it = parameters.find("name");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
136                                 }
137                                 
138                                 it = parameters.find("taxonomy");
139                                 //user has given a template file
140                                 if(it != parameters.end()){ 
141                                         path = m->hasPath(it->second);
142                                         //if the user has not given a path then, add inputdir. else leave path alone.
143                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
144                                 }
145                                 
146                                 it = parameters.find("fasta");
147                                 //user has given a template file
148                                 if(it != parameters.end()){ 
149                                         path = m->hasPath(it->second);
150                                         //if the user has not given a path then, add inputdir. else leave path alone.
151                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
152                                 }
153                         }
154                         
155                         //check for required parameters
156                         phylipfile = validParameter.validFile(parameters, "phylip", true);
157                         if (phylipfile == "not open") { abort = true; }
158                         else if (phylipfile == "not found") { phylipfile = ""; }        
159                         else {  distfile = phylipfile;  format = "phylip";      }
160                         
161                         columnfile = validParameter.validFile(parameters, "column", true);
162                         if (columnfile == "not open") { abort = true; } 
163                         else if (columnfile == "not found") { columnfile = ""; }
164                         else {  distfile = columnfile; format = "column";       }
165                         
166                         namefile = validParameter.validFile(parameters, "name", true);
167                         if (namefile == "not open") { abort = true; }   
168                         else if (namefile == "not found") { namefile = ""; }
169                         
170                         fastafile = validParameter.validFile(parameters, "fasta", true);
171                         if (fastafile == "not open") { abort = true; }  
172                         else if (fastafile == "not found") { fastafile = ""; }
173                         else { distfile = fastafile;  splitmethod = "fasta";  }
174                         
175                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
176                         if (taxFile == "not open") { abort = true; }    
177                         else if (taxFile == "not found") { taxFile = ""; }
178                         
179                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
180                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
181                 
182                         if (columnfile != "") {
183                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
184                         }
185                         
186                         if (fastafile != "") {
187                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
188                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
189                         }
190                                         
191                         //check for optional parameter and set defaults
192                         // ...at some point should added some additional type checking...
193                         //get user cutoff and precision or use defaults
194                         string temp;
195                         temp = validParameter.validFile(parameters, "precision", false);
196                         if (temp == "not found") { temp = "100"; }
197                         //saves precision legnth for formatting below
198                         length = temp.length();
199                         convert(temp, precision); 
200                         
201                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
202                         hard = m->isTrue(temp);
203                         
204                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
205                         large = m->isTrue(temp);
206                         
207                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
208                         convert(temp, processors); 
209                         
210                         temp = validParameter.validFile(parameters, "splitmethod", false);      
211                         if (splitmethod != "fasta") {
212                                 if (temp == "not found")  { splitmethod = "distance"; }
213                                 else {  splitmethod = temp; }
214                         }
215                         
216                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
217                         convert(temp, cutoff); 
218                         cutoff += (5 / (precision * 10.0));  
219                         
220                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
221                         convert(temp, taxLevelCutoff); 
222                         
223                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
224                         
225                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
226                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
227                         
228                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
229                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
230                         
231                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
232
233                         showabund = validParameter.validFile(parameters, "showabund", false);
234                         if (showabund == "not found") { showabund = "T"; }
235
236                         timing = validParameter.validFile(parameters, "timing", false);
237                         if (timing == "not found") { timing = "F"; }
238                         
239                 }
240         }
241         catch(exception& e) {
242                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
243                 exit(1);
244         }
245 }
246
247 //**********************************************************************************************************************
248
249 void ClusterSplitCommand::help(){
250         try {
251                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
252                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
253                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
254                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
255                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
256                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
257                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
258                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
259                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
260                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
261                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
262                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
263                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
264                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
265                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
266                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
267                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
268                 #ifdef USE_MPI
269                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
270                 #endif
271                 m->mothurOut("The cluster.split command should be in the following format: \n");
272                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
273                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
274
275         }
276         catch(exception& e) {
277                 m->errorOut(e, "ClusterSplitCommand", "help");
278                 exit(1);
279         }
280 }
281
282 //**********************************************************************************************************************
283
284 ClusterSplitCommand::~ClusterSplitCommand(){}
285
286 //**********************************************************************************************************************
287
288 int ClusterSplitCommand::execute(){
289         try {
290         
291                 if (abort == true) {    return 0;       }
292                 
293                 time_t estart;
294                 vector<string> listFileNames;
295                 set<string> labels;
296                 string singletonName = "";
297                 double saveCutoff = cutoff;
298
299                 //****************** file prep work ******************************//
300                 #ifdef USE_MPI
301                         int pid;
302                         int tag = 2001;
303                         MPI_Status status; 
304                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
305                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
306                         
307                         if (pid == 0) { //only process 0 converts and splits
308                         
309                 #endif
310                 
311                 //if user gave a phylip file convert to column file
312                 if (format == "phylip") {
313                         estart = time(NULL);
314                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
315                         
316                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
317                         
318                         NameAssignment* nameMap = NULL;
319                         convert->setFormat("phylip");
320                         convert->read(nameMap);
321                         
322                         if (m->control_pressed) {  delete convert;  return 0;  }
323                         
324                         distfile = convert->getOutputFile();
325                 
326                         //if no names file given with phylip file, create it
327                         ListVector* listToMakeNameFile =  convert->getListVector();
328                         if (namefile == "") {  //you need to make a namefile for split matrix
329                                 ofstream out;
330                                 namefile = phylipfile + ".names";
331                                 m->openOutputFile(namefile, out);
332                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
333                                         string bin = listToMakeNameFile->get(i);
334                                         out << bin << '\t' << bin << endl;
335                                 }
336                                 out.close();
337                         }
338                         delete listToMakeNameFile;
339                         delete convert;
340                         
341                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
342                 }
343                 if (m->control_pressed) { return 0; }
344                 
345                 estart = time(NULL);
346                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
347                 
348                 //split matrix into non-overlapping groups
349                 SplitMatrix* split;
350                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
351                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
352                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
353                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
354                 
355                 split->split();
356                 
357                 if (m->control_pressed) { delete split; return 0; }
358                 
359                 singletonName = split->getSingletonNames();
360                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
361                 delete split;
362                 
363                 if (m->control_pressed) { return 0; }
364                 
365                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
366                 estart = time(NULL);
367                 
368                 //****************** break up files between processes and cluster each file set ******************************//
369         #ifdef USE_MPI
370                         ////you are process 0 from above////
371                         
372                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
373                         dividedNames.resize(processors);
374                                         
375                         //for each file group figure out which process will complete it
376                         //want to divide the load intelligently so the big files are spread between processes
377                         int count = 1;
378                         for (int i = 0; i < distName.size(); i++) { 
379                                 int processToAssign = (i+1) % processors; 
380                                 if (processToAssign == 0) { processToAssign = processors; }
381                                                 
382                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
383                         }
384                                         
385                         //not lets reverse the order of ever other process, so we balance big files running with little ones
386                         for (int i = 0; i < processors; i++) {
387                                 int remainder = ((i+1) % processors);
388                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
389                         }
390                         
391                         
392                         //send each child the list of files it needs to process
393                         for(int i = 1; i < processors; i++) { 
394                                 //send number of file pairs
395                                 int num = dividedNames[i].size();
396                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
397                                 
398                                 for (int j = 0; j < num; j++) { //send filenames to process i
399                                         char tempDistFileName[1024];
400                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
401                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
402                                         
403                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
404                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
405                                         
406                                         char tempNameFileName[1024];
407                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
408                                         int lengthName = (dividedNames[i][j].begin()->second).length();
409
410                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
411                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
412                                 }
413                         }
414                         
415                         //process your share
416                         listFileNames = cluster(dividedNames[0], labels);
417                         
418                         //receive the other processes info
419                         for(int i = 1; i < processors; i++) { 
420                                 int num = dividedNames[i].size();
421                                 
422                                 double tempCutoff;
423                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
424                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
425                                 
426                                 //send list filenames to root process
427                                 for (int j = 0; j < num; j++) {  
428                                         int lengthList = 0;
429                                         char tempListFileName[1024];
430                                 
431                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
432                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
433                                 
434                                         string myListFileName = tempListFileName;
435                                         myListFileName = myListFileName.substr(0, lengthList);
436                                         
437                                         listFileNames.push_back(myListFileName);
438                                 }
439                                 
440                                 //send Labels to root process
441                                 int numLabels = 0;
442                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
443                                 
444                                 for (int j = 0; j < numLabels; j++) {  
445                                         int lengthLabel = 0;
446                                         char tempLabel[100];
447                                 
448                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
449                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
450                                 
451                                         string myLabel = tempLabel;
452                                         myLabel = myLabel.substr(0, lengthLabel);
453                         
454                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
455                                 }
456                         }
457                         
458                 }else { //you are a child process
459                         vector < map<string, string> >  myNames;
460                         
461                         //recieve the files you need to process
462                         //receive number of file pairs
463                         int num = 0;
464                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
465                         
466                         myNames.resize(num);
467         
468                         for (int j = 0; j < num; j++) { //receive filenames to process 
469                                 int lengthDist = 0;
470                                 char tempDistFileName[1024];
471                                 
472                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
473                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
474                                 
475                                 string myDistFileName = tempDistFileName;
476                                 myDistFileName = myDistFileName.substr(0, lengthDist);
477                         
478                                 int lengthName = 0;
479                                 char tempNameFileName[1024];
480                                 
481                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
482                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
483                                 
484                                 string myNameFileName = tempNameFileName;
485                                 myNameFileName = myNameFileName.substr(0, lengthName);
486                                 
487                                 //save file name
488                                 myNames[j][myDistFileName] = myNameFileName;
489                         }
490         
491                         //process them
492                         listFileNames = cluster(myNames, labels);
493                         
494                         //send cutoff
495                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
496                         
497                         //send list filenames to root process
498                         for (int j = 0; j < num; j++) {  
499                                 char tempListFileName[1024];
500                                 strcpy(tempListFileName, listFileNames[j].c_str());
501                                 int lengthList = listFileNames[j].length();
502                                         
503                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
504                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
505                         }
506                         
507                         //send Labels to root process
508                         int numLabels = labels.size();
509                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
510                         
511                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
512                                 char tempLabel[100];
513                                 strcpy(tempLabel, (*it).c_str());
514                                 int lengthLabel = (*it).length();
515                                         
516                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
517                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
518                         }
519                 }
520                 
521                 //make everyone wait
522                 MPI_Barrier(MPI_COMM_WORLD);
523                 
524         #else
525
526                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
527                                 if(processors == 1){
528                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
529                                 }else{
530                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
531                                         dividedNames.resize(processors);
532                                         
533                                         //for each file group figure out which process will complete it
534                                         //want to divide the load intelligently so the big files are spread between processes
535                                         int count = 1;
536                                         for (int i = 0; i < distName.size(); i++) { 
537                                                 int processToAssign = (i+1) % processors; 
538                                                 if (processToAssign == 0) { processToAssign = processors; }
539                                                 
540                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
541                                         }
542                                         
543                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
544                                         for (int i = 0; i < processors; i++) {
545                                                 int remainder = ((i+1) % processors);
546                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
547                                         }
548                                         
549                                         createProcesses(dividedNames);
550                                                         
551                                         if (m->control_pressed) { return 0; }
552
553                                         //get list of list file names from each process
554                                         for(int i=0;i<processors;i++){
555                                                 string filename = toString(processIDS[i]) + ".temp";
556                                                 ifstream in;
557                                                 m->openInputFile(filename, in);
558                                                 
559                                                 in >> tag; m->gobble(in);
560                                                 
561                                                 while(!in.eof()) {
562                                                         string tempName;
563                                                         in >> tempName; m->gobble(in);
564                                                         listFileNames.push_back(tempName);
565                                                 }
566                                                 in.close();
567                                                 remove((toString(processIDS[i]) + ".temp").c_str());
568                                                 
569                                                 //get labels
570                                                 filename = toString(processIDS[i]) + ".temp.labels";
571                                                 ifstream in2;
572                                                 m->openInputFile(filename, in2);
573                                                 
574                                                 float tempCutoff;
575                                                 in2 >> tempCutoff; m->gobble(in2);
576                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
577                                                 
578                                                 while(!in2.eof()) {
579                                                         string tempName;
580                                                         in2 >> tempName; m->gobble(in2);
581                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
582                                                 }
583                                                 in2.close();
584                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
585                                         }
586                                 }
587                 #else
588                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
589                 #endif
590         #endif  
591                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
592                 
593                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
594                 
595                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
596                 
597                 //****************** merge list file and create rabund and sabund files ******************************//
598                 estart = time(NULL);
599                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
600                 
601                 #ifdef USE_MPI
602                         if (pid == 0) { //only process 0 merges
603                 #endif
604
605                 ListVector* listSingle;
606                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
607                 
608                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
609                 
610                 mergeLists(listFileNames, labelBins, listSingle);
611
612                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
613                 
614                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
615                 
616                 m->mothurOutEndLine();
617                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
618                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
619                 m->mothurOutEndLine();
620                 
621                 #ifdef USE_MPI
622                         } //only process 0 merges
623                         
624                         //make everyone wait
625                         MPI_Barrier(MPI_COMM_WORLD);
626                 #endif
627
628                 return 0;
629         }
630         catch(exception& e) {
631                 m->errorOut(e, "ClusterSplitCommand", "execute");
632                 exit(1);
633         }
634 }
635 //**********************************************************************************************************************
636 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
637         try {
638                                 
639                 map<float, int> labelBin;
640                 vector<float> orderFloat;
641                 int numSingleBins;
642                 
643                 //read in singletons
644                 if (singleton != "none") {
645                         ifstream in;
646                         m->openInputFile(singleton, in);
647                                 
648                         string firstCol, secondCol;
649                         listSingle = new ListVector();
650                         while (!in.eof()) {
651                                 in >> firstCol >> secondCol; m->gobble(in);
652                                 listSingle->push_back(secondCol);
653                         }
654                         in.close();
655                         remove(singleton.c_str());
656                         
657                         numSingleBins = listSingle->getNumBins();
658                 }else{  listSingle = NULL; numSingleBins = 0;  }
659                 
660                 //go through users set and make them floats so we can sort them 
661                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
662                         float temp = -10.0;
663
664                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
665                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
666                         
667                         if (temp <= cutoff) {
668                                 orderFloat.push_back(temp);
669                                 labelBin[temp] = numSingleBins; //initialize numbins 
670                         }
671                 }
672         
673                 //sort order
674                 sort(orderFloat.begin(), orderFloat.end());
675                 userLabels.clear();
676                         
677                 //get the list info from each file
678                 for (int k = 0; k < listNames.size(); k++) {
679         
680                         if (m->control_pressed) {  
681                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
682                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
683                                 return labelBin;
684                         }
685                         
686                         InputData* input = new InputData(listNames[k], "list");
687                         ListVector* list = input->getListVector();
688                         string lastLabel = list->getLabel();
689                         
690                         string filledInList = listNames[k] + "filledInTemp";
691                         ofstream outFilled;
692                         m->openOutputFile(filledInList, outFilled);
693         
694                         //for each label needed
695                         for(int l = 0; l < orderFloat.size(); l++){
696                         
697                                 string thisLabel;
698                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
699                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
700
701                                 //this file has reached the end
702                                 if (list == NULL) { 
703                                         list = input->getListVector(lastLabel, true); 
704                                 }else{  //do you have the distance, or do you need to fill in
705                                                 
706                                         float labelFloat;
707                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
708                                         else { convert(list->getLabel(), labelFloat); }
709
710                                         //check for missing labels
711                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
712                                                 //if its bigger get last label, otherwise keep it
713                                                 delete list;
714                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
715                                         }
716                                         lastLabel = list->getLabel();
717                                 }
718                                 
719                                 //print to new file
720                                 list->setLabel(thisLabel);
721                                 list->print(outFilled);
722                 
723                                 //update labelBin
724                                 labelBin[orderFloat[l]] += list->getNumBins();
725                                                                         
726                                 delete list;
727                                                                         
728                                 list = input->getListVector();
729                         }
730                         
731                         if (list != NULL) { delete list; }
732                         delete input;
733                         
734                         outFilled.close();
735                         remove(listNames[k].c_str());
736                         rename(filledInList.c_str(), listNames[k].c_str());
737                 }
738                 
739                 return labelBin;
740         }
741         catch(exception& e) {
742                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
743                 exit(1);
744         }
745 }
746 //**********************************************************************************************************************
747 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
748         try {
749                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
750                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
751                 
752                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
753                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
754                 m->openOutputFile(fileroot+ tag + ".list",              outList);
755                                 
756                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
757                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
758                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
759                 
760                 map<float, int>::iterator itLabel;
761
762                 //for each label needed
763                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
764                         
765                         string thisLabel;
766                         if (itLabel->first == -1) { thisLabel = "unique"; }
767                         else { thisLabel = toString(itLabel->first,  length-1);  } 
768                         
769                         outList << thisLabel << '\t' << itLabel->second << '\t';
770
771                         RAbundVector* rabund = new RAbundVector();
772                         rabund->setLabel(thisLabel);
773
774                         //add in singletons
775                         if (listSingle != NULL) {
776                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
777                                         outList << listSingle->get(j) << '\t';
778                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
779                                 }
780                         }
781                         
782                         //get the list info from each file
783                         for (int k = 0; k < listNames.size(); k++) {
784         
785                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
786                                 
787                                 InputData* input = new InputData(listNames[k], "list");
788                                 ListVector* list = input->getListVector(thisLabel);
789                                 
790                                 //this file has reached the end
791                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
792                                 else {          
793                                         for (int j = 0; j < list->getNumBins(); j++) {
794                                                 outList << list->get(j) << '\t';
795                                                 rabund->push_back(m->getNumNames(list->get(j)));
796                                         }
797                                         delete list;
798                                 }
799                                 delete input;
800                         }
801                         
802                         SAbundVector sabund = rabund->getSAbundVector();
803                         
804                         sabund.print(outSabund);
805                         rabund->print(outRabund);
806                         outList << endl;
807                         
808                         delete rabund;
809                 }
810                 
811                 outList.close();
812                 outRabund.close();
813                 outSabund.close();
814                 
815                 if (listSingle != NULL) { delete listSingle;  }
816                 
817                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
818                 
819                 return 0;
820         }
821         catch(exception& e) {
822                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
823                 exit(1);
824         }
825 }
826
827 //**********************************************************************************************************************
828
829 void ClusterSplitCommand::printData(ListVector* oldList){
830         try {
831                 string label = oldList->getLabel();
832                 RAbundVector oldRAbund = oldList->getRAbundVector();
833                 
834                 oldRAbund.setLabel(label);
835                 if (m->isTrue(showabund)) {
836                         oldRAbund.getSAbundVector().print(cout);
837                 }
838                 oldRAbund.print(outRabund);
839                 oldRAbund.getSAbundVector().print(outSabund);
840         
841                 oldList->print(outList);
842         }
843         catch(exception& e) {
844                 m->errorOut(e, "ClusterSplitCommand", "printData");
845                 exit(1);
846         }
847 }
848 //**********************************************************************************************************************
849 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
850         try {
851         
852         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
853                 int process = 0;
854                 int exitCommand = 1;
855                 processIDS.clear();
856                 
857                 //loop through and create all the processes you want
858                 while (process != processors) {
859                         int pid = fork();
860                         
861                         if (pid > 0) {
862                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
863                                 process++;
864                         }else if (pid == 0){
865                                 set<string> labels;
866                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
867                                 
868                                 //write out names to file
869                                 string filename = toString(getpid()) + ".temp";
870                                 ofstream out;
871                                 m->openOutputFile(filename, out);
872                                 out << tag << endl;
873                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
874                                 out.close();
875                                 
876                                 //print out labels
877                                 ofstream outLabels;
878                                 filename = toString(getpid()) + ".temp.labels";
879                                 m->openOutputFile(filename, outLabels);
880                                 
881                                 outLabels << cutoff << endl;
882                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
883                                         outLabels << (*it) << endl;
884                                 }
885                                 outLabels.close();
886
887                                 exit(0);
888                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
889                 }
890                 
891                 //force parent to wait until all the processes are done
892                 for (int i=0;i<processors;i++) { 
893                         int temp = processIDS[i];
894                         wait(&temp);
895                 }
896                 
897                 return exitCommand;
898         #endif          
899         
900         }
901         catch(exception& e) {
902                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
903                 exit(1);
904         }
905 }
906 //**********************************************************************************************************************
907
908 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
909         try {
910                 Cluster* cluster;
911                 SparseMatrix* matrix;
912                 ListVector* list;
913                 ListVector oldList;
914                 RAbundVector* rabund;
915                 
916                 vector<string> listFileNames;
917                 
918                 double smallestCutoff = cutoff;
919                 
920                 //cluster each distance file
921                 for (int i = 0; i < distNames.size(); i++) {
922                         if (m->control_pressed) { return listFileNames; }
923                         
924                         string thisNamefile = distNames[i].begin()->second;
925                         string thisDistFile = distNames[i].begin()->first;
926                         
927                         //read in distance file
928                         globaldata->setNameFile(thisNamefile);
929                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
930                         
931                         #ifdef USE_MPI
932                                 int pid;
933                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
934                                 
935                                 //output your files too
936                                 if (pid != 0) {
937                                         cout << endl << "Reading " << thisDistFile << endl;
938                                 }
939                         #endif
940                         
941                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
942                         
943                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
944                         read->setCutoff(cutoff);
945
946                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
947                         nameMap->readMap();
948                         read->read(nameMap);
949                         
950                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
951                         
952                         list = read->getListVector();
953                         oldList = *list;
954                         matrix = read->getMatrix();
955                         
956                         delete read; 
957                         delete nameMap; 
958                         
959                         
960                         #ifdef USE_MPI
961                                 //output your files too
962                                 if (pid != 0) {
963                                         cout << endl << "Clustering " << thisDistFile << endl;
964                                 }
965                         #endif
966                         
967                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
968                 
969                         rabund = new RAbundVector(list->getRAbundVector());
970                         
971                         //create cluster
972                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
973                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
974                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
975                         tag = cluster->getTag();
976                 
977                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
978                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
979                         
980                         ofstream listFile;
981                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
982                 
983                         listFileNames.push_back(fileroot+ tag + ".list");
984                 
985                         time_t estart = time(NULL);
986                         
987                         float previousDist = 0.00000;
988                         float rndPreviousDist = 0.00000;
989                         
990                         oldList = *list;
991
992                         print_start = true;
993                         start = time(NULL);
994                         double saveCutoff = cutoff;
995                 
996                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
997                 
998                                 if (m->control_pressed) { //clean up
999                                         delete matrix; delete list;     delete cluster; delete rabund;
1000                                         listFile.close();
1001                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1002                                         listFileNames.clear(); return listFileNames;
1003                                 }
1004                 
1005                                 cluster->update(saveCutoff);
1006         
1007                                 float dist = matrix->getSmallDist();
1008                                 float rndDist;
1009                                 if (hard) {
1010                                         rndDist = m->ceilDist(dist, precision); 
1011                                 }else{
1012                                         rndDist = m->roundDist(dist, precision); 
1013                                 }
1014
1015                                 if(previousDist <= 0.0000 && dist != previousDist){
1016                                         oldList.setLabel("unique");
1017                                         oldList.print(listFile);
1018                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1019                                 }
1020                                 else if(rndDist != rndPreviousDist){
1021                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1022                                         oldList.print(listFile);
1023                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1024                                 }
1025                 
1026                                 previousDist = dist;
1027                                 rndPreviousDist = rndDist;
1028                                 oldList = *list;
1029                         }
1030
1031                 
1032                         if(previousDist <= 0.0000){
1033                                 oldList.setLabel("unique");
1034                                 oldList.print(listFile);
1035                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1036                         }
1037                         else if(rndPreviousDist<cutoff){
1038                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1039                                 oldList.print(listFile);
1040                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1041                         }
1042         
1043                         delete matrix; delete list;     delete cluster; delete rabund; 
1044                         listFile.close();
1045                         
1046                         if (m->control_pressed) { //clean up
1047                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1048                                 listFileNames.clear(); return listFileNames;
1049                         }
1050                         
1051                         remove(thisDistFile.c_str());
1052                         remove(thisNamefile.c_str());
1053                         
1054                         if (saveCutoff != cutoff) { 
1055                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1056                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1057                         
1058                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1059                         }
1060                         
1061                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1062                 }
1063                 
1064                 cutoff = smallestCutoff;
1065                                         
1066                 return listFileNames;
1067         
1068         }
1069         catch(exception& e) {
1070                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1071                 exit(1);
1072         }
1073
1074
1075 }
1076
1077 //**********************************************************************************************************************