]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
removed unused copy constructors and comments within comments that where causing...
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13
14 //**********************************************************************************************************************
15 vector<string> ClusterSplitCommand::setParameters(){    
16         try {
17                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
18                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
19                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
20                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
26                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
29                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
30                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
31                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
34                         
35                 vector<string> myArray;
36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
37                 return myArray;
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
41                 exit(1);
42         }
43 }
44 //**********************************************************************************************************************
45 string ClusterSplitCommand::getHelpString(){    
46         try {
47                 string helpString = "";
48                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
49                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
50                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
51                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
52                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
53                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
55                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
56                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
57                 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
58                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
59                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
60                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
61                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
62                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
63                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
64                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
65 #ifdef USE_MPI
66                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
67 #endif
68                 helpString += "The cluster.split command should be in the following format: \n";
69                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
70                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
71                 return helpString;
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
75                 exit(1);
76         }
77 }
78 //**********************************************************************************************************************
79 ClusterSplitCommand::ClusterSplitCommand(){     
80         try {
81                 abort = true; calledHelp = true; 
82                 setParameters();
83                 vector<string> tempOutNames;
84                 outputTypes["list"] = tempOutNames;
85                 outputTypes["rabund"] = tempOutNames;
86                 outputTypes["sabund"] = tempOutNames;
87                 outputTypes["column"] = tempOutNames;
88         }
89         catch(exception& e) {
90                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
91                 exit(1);
92         }
93 }
94 //**********************************************************************************************************************
95 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
96 ClusterSplitCommand::ClusterSplitCommand(string option)  {
97         try{
98                 abort = false; calledHelp = false;   
99                 format = "";
100                 
101                 //allow user to run help
102                 if(option == "help") { help(); abort = true; calledHelp = true; }
103                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
104                 
105                 else {
106                         vector<string> myArray = setParameters();
107                         
108                         OptionParser parser(option);
109                         map<string,string> parameters = parser.getParameters();
110                         
111                         ValidParameters validParameter("cluster.split");
112                 
113                         //check to make sure all parameters are valid for command
114                         map<string,string>::iterator it;
115                         for (it = parameters.begin(); it != parameters.end(); it++) { 
116                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
117                                         abort = true;
118                                 }
119                         }
120                         
121                         //initialize outputTypes
122                         vector<string> tempOutNames;
123                         outputTypes["list"] = tempOutNames;
124                         outputTypes["rabund"] = tempOutNames;
125                         outputTypes["sabund"] = tempOutNames;
126                         outputTypes["column"] = tempOutNames;
127                         
128                         //if the user changes the output directory command factory will send this info to us in the output parameter 
129                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
130                         
131                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
132                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
133                         if (inputDir == "not found"){   inputDir = "";          }
134                         else {
135                                 string path;
136                                 it = parameters.find("phylip");
137                                 //user has given a template file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
142                                 }
143                                 
144                                 it = parameters.find("column");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
150                                 }
151                                 
152                                 it = parameters.find("name");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
158                                 }
159                                 
160                                 it = parameters.find("taxonomy");
161                                 //user has given a template file
162                                 if(it != parameters.end()){ 
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
166                                 }
167                                 
168                                 it = parameters.find("fasta");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
174                                 }
175                         }
176                         
177                         //check for required parameters
178                         phylipfile = validParameter.validFile(parameters, "phylip", true);
179                         if (phylipfile == "not open") { abort = true; }
180                         else if (phylipfile == "not found") { phylipfile = ""; }        
181                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
182                         
183                         columnfile = validParameter.validFile(parameters, "column", true);
184                         if (columnfile == "not open") { abort = true; } 
185                         else if (columnfile == "not found") { columnfile = ""; }
186                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
187                         
188                         namefile = validParameter.validFile(parameters, "name", true);
189                         if (namefile == "not open") { abort = true; }   
190                         else if (namefile == "not found") { namefile = "";  }
191                         else { m->setNameFile(namefile); }
192                         
193                         fastafile = validParameter.validFile(parameters, "fasta", true);
194                         if (fastafile == "not open") { abort = true; }  
195                         else if (fastafile == "not found") { fastafile = ""; }
196                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
197                         
198                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
199                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
200                         else if (taxFile == "not found") { taxFile = ""; }
201                         else {  m->setTaxonomyFile(taxFile); }
202                         
203                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
204                                 //is there are current file available for either of these?
205                                 //give priority to column, then phylip, then fasta
206                                 columnfile = m->getColumnFile(); 
207                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
208                                 else { 
209                                         phylipfile = m->getPhylipFile(); 
210                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
211                                         else { 
212                                                 fastafile = m->getFastaFile(); 
213                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
214                                                 else { 
215                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
216                                                         abort = true; 
217                                                 }
218                                         }
219                                 }
220                         }
221                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
222                 
223                         if (columnfile != "") {
224                                 if (namefile == "") { 
225                                         namefile = m->getNameFile(); 
226                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
227                                         else { 
228                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
229                                                 abort = true; 
230                                         }       
231                                 }
232                         }
233                         
234                         if (fastafile != "") {
235                                 if (taxFile == "") { 
236                                         taxFile = m->getTaxonomyFile(); 
237                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
238                                         else { 
239                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
240                                                 abort = true; 
241                                         }       
242                                 }
243                                 
244                                 if (namefile == "") { 
245                                         namefile = m->getNameFile(); 
246                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
247                                         else { 
248                                                 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
249                                                 abort = true; 
250                                         }       
251                                 }
252                         }
253                                         
254                         //check for optional parameter and set defaults
255                         // ...at some point should added some additional type checking...
256                         //get user cutoff and precision or use defaults
257                         string temp;
258                         temp = validParameter.validFile(parameters, "precision", false);
259                         if (temp == "not found") { temp = "100"; }
260                         //saves precision legnth for formatting below
261                         length = temp.length();
262                         m->mothurConvert(temp, precision); 
263                         
264                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
265                         hard = m->isTrue(temp);
266                         
267                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
268                         large = m->isTrue(temp);
269                         
270                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
271                         m->setProcessors(temp);
272                         m->mothurConvert(temp, processors);
273                         
274                         temp = validParameter.validFile(parameters, "splitmethod", false);      
275                         if (splitmethod != "fasta") {
276                                 if (temp == "not found")  { splitmethod = "distance"; }
277                                 else {  splitmethod = temp; }
278                         }
279                         
280                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
281                         m->mothurConvert(temp, cutoff); 
282                         cutoff += (5 / (precision * 10.0));  
283                         
284                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
285                         m->mothurConvert(temp, taxLevelCutoff); 
286                         
287                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
288                         
289                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
290                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
291                         
292                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
293                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
294                         
295                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
296
297                         showabund = validParameter.validFile(parameters, "showabund", false);
298                         if (showabund == "not found") { showabund = "T"; }
299
300                         timing = validParameter.validFile(parameters, "timing", false);
301                         if (timing == "not found") { timing = "F"; }
302                         
303                 }
304         }
305         catch(exception& e) {
306                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
307                 exit(1);
308         }
309 }
310
311 //**********************************************************************************************************************
312
313 int ClusterSplitCommand::execute(){
314         try {
315         
316                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
317                 
318                 time_t estart;
319                 vector<string> listFileNames;
320                 set<string> labels;
321                 string singletonName = "";
322                 double saveCutoff = cutoff;
323
324                 //****************** file prep work ******************************//
325                 #ifdef USE_MPI
326                         int pid;
327                         int tag = 2001;
328                         MPI_Status status; 
329                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
330                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
331                         
332                         if (pid == 0) { //only process 0 converts and splits
333                         
334                 #endif
335                 
336                 //if user gave a phylip file convert to column file
337                 if (format == "phylip") {
338                         estart = time(NULL);
339                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
340                         
341                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
342                         
343                         NameAssignment* nameMap = NULL;
344                         convert->setFormat("phylip");
345                         convert->read(nameMap);
346                         
347                         if (m->control_pressed) {  delete convert;  return 0;  }
348                         
349                         distfile = convert->getOutputFile();
350                 
351                         //if no names file given with phylip file, create it
352                         ListVector* listToMakeNameFile =  convert->getListVector();
353                         if (namefile == "") {  //you need to make a namefile for split matrix
354                                 ofstream out;
355                                 namefile = phylipfile + ".names";
356                                 m->openOutputFile(namefile, out);
357                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
358                                         string bin = listToMakeNameFile->get(i);
359                                         out << bin << '\t' << bin << endl;
360                                 }
361                                 out.close();
362                         }
363                         delete listToMakeNameFile;
364                         delete convert;
365                         
366                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
367                 }
368                 if (m->control_pressed) { return 0; }
369                 
370                 estart = time(NULL);
371                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
372                 
373                 //split matrix into non-overlapping groups
374                 SplitMatrix* split;
375                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
376                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
377                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
378                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
379                 
380                 split->split();
381                 
382                 if (m->control_pressed) { delete split; return 0; }
383                 
384                 singletonName = split->getSingletonNames();
385                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
386                 delete split;
387                 
388                 //output a merged distance file
389                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
390                         
391                                 
392                 if (m->control_pressed) { return 0; }
393                 
394                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
395                 estart = time(NULL);
396                 
397                 //****************** break up files between processes and cluster each file set ******************************//
398         #ifdef USE_MPI
399                         ////you are process 0 from above////
400                         
401                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
402                         dividedNames.resize(processors);
403                                         
404                         //for each file group figure out which process will complete it
405                         //want to divide the load intelligently so the big files are spread between processes
406                         for (int i = 0; i < distName.size(); i++) { 
407                                 int processToAssign = (i+1) % processors; 
408                                 if (processToAssign == 0) { processToAssign = processors; }
409                                                 
410                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
411                         }
412                                         
413                         //not lets reverse the order of ever other process, so we balance big files running with little ones
414                         for (int i = 0; i < processors; i++) {
415                                 int remainder = ((i+1) % processors);
416                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
417                         }
418                         
419                         
420                         //send each child the list of files it needs to process
421                         for(int i = 1; i < processors; i++) { 
422                                 //send number of file pairs
423                                 int num = dividedNames[i].size();
424                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
425                                 
426                                 for (int j = 0; j < num; j++) { //send filenames to process i
427                                         char tempDistFileName[1024];
428                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
429                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
430                                         
431                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
432                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
433                                         
434                                         char tempNameFileName[1024];
435                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
436                                         int lengthName = (dividedNames[i][j].begin()->second).length();
437
438                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
439                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
440                                 }
441                         }
442                         
443                         //process your share
444                         listFileNames = cluster(dividedNames[0], labels);
445                         
446                         //receive the other processes info
447                         for(int i = 1; i < processors; i++) { 
448                                 int num = dividedNames[i].size();
449                                 
450                                 double tempCutoff;
451                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
452                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
453                                 
454                                 //send list filenames to root process
455                                 for (int j = 0; j < num; j++) {  
456                                         int lengthList = 0;
457                                         char tempListFileName[1024];
458                                 
459                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
460                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
461                                 
462                                         string myListFileName = tempListFileName;
463                                         myListFileName = myListFileName.substr(0, lengthList);
464                                         
465                                         listFileNames.push_back(myListFileName);
466                                 }
467                                 
468                                 //send Labels to root process
469                                 int numLabels = 0;
470                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
471                                 
472                                 for (int j = 0; j < numLabels; j++) {  
473                                         int lengthLabel = 0;
474                                         char tempLabel[100];
475                                 
476                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
477                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
478                                 
479                                         string myLabel = tempLabel;
480                                         myLabel = myLabel.substr(0, lengthLabel);
481                         
482                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
483                                 }
484                         }
485                         
486                 }else { //you are a child process
487                         vector < map<string, string> >  myNames;
488                         
489                         //recieve the files you need to process
490                         //receive number of file pairs
491                         int num = 0;
492                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
493                         
494                         myNames.resize(num);
495         
496                         for (int j = 0; j < num; j++) { //receive filenames to process 
497                                 int lengthDist = 0;
498                                 char tempDistFileName[1024];
499                                 
500                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
501                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
502                                 
503                                 string myDistFileName = tempDistFileName;
504                                 myDistFileName = myDistFileName.substr(0, lengthDist);
505                         
506                                 int lengthName = 0;
507                                 char tempNameFileName[1024];
508                                 
509                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
510                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
511                                 
512                                 string myNameFileName = tempNameFileName;
513                                 myNameFileName = myNameFileName.substr(0, lengthName);
514                                 
515                                 //save file name
516                                 myNames[j][myDistFileName] = myNameFileName;
517                         }
518         
519                         //process them
520                         listFileNames = cluster(myNames, labels);
521                         
522                         //send cutoff
523                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
524                         
525                         //send list filenames to root process
526                         for (int j = 0; j < num; j++) {  
527                                 char tempListFileName[1024];
528                                 strcpy(tempListFileName, listFileNames[j].c_str());
529                                 int lengthList = listFileNames[j].length();
530                                         
531                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
532                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
533                         }
534                         
535                         //send Labels to root process
536                         int numLabels = labels.size();
537                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
538                         
539                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
540                                 char tempLabel[100];
541                                 strcpy(tempLabel, (*it).c_str());
542                                 int lengthLabel = (*it).length();
543                                         
544                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
545                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
546                         }
547                 }
548                 
549                 //make everyone wait
550                 MPI_Barrier(MPI_COMM_WORLD);
551                 
552         #else
553                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
554                 //sanity check
555                 if (processors > distName.size()) { processors = distName.size(); }
556                 
557                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
558                                 if(processors == 1){
559                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
560                                 }else{
561                                         listFileNames = createProcesses(distName, labels);
562                 }
563                 #else
564                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
565                 #endif
566         #endif  
567                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
568                 
569                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
570                 
571                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
572                 
573                 //****************** merge list file and create rabund and sabund files ******************************//
574                 estart = time(NULL);
575                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
576                 
577                 #ifdef USE_MPI
578                         if (pid == 0) { //only process 0 merges
579                 #endif
580
581                 ListVector* listSingle;
582                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
583                 
584                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
585                 
586                 mergeLists(listFileNames, labelBins, listSingle);
587
588                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
589                 
590                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
591                 
592                 //set list file as new current listfile
593                 string current = "";
594                 itTypes = outputTypes.find("list");
595                 if (itTypes != outputTypes.end()) {
596                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
597                 }
598                 
599                 //set rabund file as new current rabundfile
600                 itTypes = outputTypes.find("rabund");
601                 if (itTypes != outputTypes.end()) {
602                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
603                 }
604                 
605                 //set sabund file as new current sabundfile
606                 itTypes = outputTypes.find("sabund");
607                 if (itTypes != outputTypes.end()) {
608                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
609                 }
610                                 
611                 m->mothurOutEndLine();
612                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
613                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
614                 m->mothurOutEndLine();
615                 
616                 #ifdef USE_MPI
617                         } //only process 0 merges
618                         
619                         //make everyone wait
620                         MPI_Barrier(MPI_COMM_WORLD);
621                 #endif
622
623                 return 0;
624         }
625         catch(exception& e) {
626                 m->errorOut(e, "ClusterSplitCommand", "execute");
627                 exit(1);
628         }
629 }
630 //**********************************************************************************************************************
631 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
632         try {
633                                 
634                 map<float, int> labelBin;
635                 vector<float> orderFloat;
636                 int numSingleBins;
637                 
638                 //read in singletons
639                 if (singleton != "none") {
640                         ifstream in;
641                         m->openInputFile(singleton, in);
642                                 
643                         string firstCol, secondCol;
644                         listSingle = new ListVector();
645                         while (!in.eof()) {
646                                 in >> firstCol >> secondCol; m->gobble(in);
647                                 listSingle->push_back(secondCol);
648                         }
649                         in.close();
650                         m->mothurRemove(singleton);
651                         
652                         numSingleBins = listSingle->getNumBins();
653                 }else{  listSingle = NULL; numSingleBins = 0;  }
654                 
655                 //go through users set and make them floats so we can sort them 
656                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
657                         float temp = -10.0;
658
659                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
660                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
661                         
662                         if (temp <= cutoff) {
663                                 orderFloat.push_back(temp);
664                                 labelBin[temp] = numSingleBins; //initialize numbins 
665                         }
666                 }
667         
668                 //sort order
669                 sort(orderFloat.begin(), orderFloat.end());
670                 userLabels.clear();
671                         
672                 //get the list info from each file
673                 for (int k = 0; k < listNames.size(); k++) {
674         
675                         if (m->control_pressed) {  
676                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
677                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
678                                 return labelBin;
679                         }
680                         
681                         InputData* input = new InputData(listNames[k], "list");
682                         ListVector* list = input->getListVector();
683                         string lastLabel = list->getLabel();
684                         
685                         string filledInList = listNames[k] + "filledInTemp";
686                         ofstream outFilled;
687                         m->openOutputFile(filledInList, outFilled);
688         
689                         //for each label needed
690                         for(int l = 0; l < orderFloat.size(); l++){
691                         
692                                 string thisLabel;
693                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
694                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
695
696                                 //this file has reached the end
697                                 if (list == NULL) { 
698                                         list = input->getListVector(lastLabel, true); 
699                                 }else{  //do you have the distance, or do you need to fill in
700                                                 
701                                         float labelFloat;
702                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
703                                         else { convert(list->getLabel(), labelFloat); }
704
705                                         //check for missing labels
706                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
707                                                 //if its bigger get last label, otherwise keep it
708                                                 delete list;
709                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
710                                         }
711                                         lastLabel = list->getLabel();
712                                 }
713                                 
714                                 //print to new file
715                                 list->setLabel(thisLabel);
716                                 list->print(outFilled);
717                 
718                                 //update labelBin
719                                 labelBin[orderFloat[l]] += list->getNumBins();
720                                                                         
721                                 delete list;
722                                                                         
723                                 list = input->getListVector();
724                         }
725                         
726                         if (list != NULL) { delete list; }
727                         delete input;
728                         
729                         outFilled.close();
730                         m->mothurRemove(listNames[k]);
731                         rename(filledInList.c_str(), listNames[k].c_str());
732                 }
733                 
734                 return labelBin;
735         }
736         catch(exception& e) {
737                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
738                 exit(1);
739         }
740 }
741 //**********************************************************************************************************************
742 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
743         try {
744                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
745                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
746                 
747                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
748                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
749                 m->openOutputFile(fileroot+ tag + ".list",              outList);
750                                 
751                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
752                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
753                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
754                 
755                 map<float, int>::iterator itLabel;
756
757                 //for each label needed
758                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
759                         
760                         string thisLabel;
761                         if (itLabel->first == -1) { thisLabel = "unique"; }
762                         else { thisLabel = toString(itLabel->first,  length-1);  } 
763                         
764                         outList << thisLabel << '\t' << itLabel->second << '\t';
765
766                         RAbundVector* rabund = new RAbundVector();
767                         rabund->setLabel(thisLabel);
768
769                         //add in singletons
770                         if (listSingle != NULL) {
771                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
772                                         outList << listSingle->get(j) << '\t';
773                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
774                                 }
775                         }
776                         
777                         //get the list info from each file
778                         for (int k = 0; k < listNames.size(); k++) {
779         
780                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
781                                 
782                                 InputData* input = new InputData(listNames[k], "list");
783                                 ListVector* list = input->getListVector(thisLabel);
784                                 
785                                 //this file has reached the end
786                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
787                                 else {          
788                                         for (int j = 0; j < list->getNumBins(); j++) {
789                                                 outList << list->get(j) << '\t';
790                                                 rabund->push_back(m->getNumNames(list->get(j)));
791                                         }
792                                         delete list;
793                                 }
794                                 delete input;
795                         }
796                         
797                         SAbundVector sabund = rabund->getSAbundVector();
798                         
799                         sabund.print(outSabund);
800                         rabund->print(outRabund);
801                         outList << endl;
802                         
803                         delete rabund;
804                 }
805                 
806                 outList.close();
807                 outRabund.close();
808                 outSabund.close();
809                 
810                 if (listSingle != NULL) { delete listSingle;  }
811                 
812                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
813                 
814                 return 0;
815         }
816         catch(exception& e) {
817                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
818                 exit(1);
819         }
820 }
821
822 //**********************************************************************************************************************
823
824 void ClusterSplitCommand::printData(ListVector* oldList){
825         try {
826                 string label = oldList->getLabel();
827                 RAbundVector oldRAbund = oldList->getRAbundVector();
828                 
829                 oldRAbund.setLabel(label);
830                 if (m->isTrue(showabund)) {
831                         oldRAbund.getSAbundVector().print(cout);
832                 }
833                 oldRAbund.print(outRabund);
834                 oldRAbund.getSAbundVector().print(outSabund);
835         
836                 oldList->print(outList);
837         }
838         catch(exception& e) {
839                 m->errorOut(e, "ClusterSplitCommand", "printData");
840                 exit(1);
841         }
842 }
843 //**********************************************************************************************************************
844 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
845         try {
846         
847         vector<string> listFiles;
848         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
849         dividedNames.resize(processors);
850         
851         //for each file group figure out which process will complete it
852         //want to divide the load intelligently so the big files are spread between processes
853         for (int i = 0; i < distName.size(); i++) { 
854             //cout << i << endl;
855             int processToAssign = (i+1) % processors; 
856             if (processToAssign == 0) { processToAssign = processors; }
857             
858             dividedNames[(processToAssign-1)].push_back(distName[i]);
859             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
860         }
861         
862         //not lets reverse the order of ever other process, so we balance big files running with little ones
863         for (int i = 0; i < processors; i++) {
864             //cout << i << endl;
865             int remainder = ((i+1) % processors);
866             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
867         }
868         
869         if (m->control_pressed) { return listFiles; }
870         
871         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
872                 int process = 1;
873                 processIDS.clear();
874                 
875                 //loop through and create all the processes you want
876                 while (process != processors) {
877                         int pid = fork();
878                         
879                         if (pid > 0) {
880                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
881                                 process++;
882                         }else if (pid == 0){
883                                 set<string> labels;
884                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
885                                 
886                                 //write out names to file
887                                 string filename = toString(getpid()) + ".temp";
888                                 ofstream out;
889                                 m->openOutputFile(filename, out);
890                                 out << tag << endl;
891                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
892                                 out.close();
893                                 
894                                 //print out labels
895                                 ofstream outLabels;
896                                 filename = toString(getpid()) + ".temp.labels";
897                                 m->openOutputFile(filename, outLabels);
898                                 
899                                 outLabels << cutoff << endl;
900                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
901                                         outLabels << (*it) << endl;
902                                 }
903                                 outLabels.close();
904
905                                 exit(0);
906                         }else { 
907                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
908                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
909                                 exit(0);
910                         }
911                 }
912                 
913         //do your part
914         listFiles = cluster(dividedNames[0], labels);
915         
916                 //force parent to wait until all the processes are done
917                 for (int i=0;i< processIDS.size();i++) { 
918                         int temp = processIDS[i];
919                         wait(&temp);
920                 }
921         
922         //get list of list file names from each process
923         for(int i=0;i<processIDS.size();i++){
924             string filename = toString(processIDS[i]) + ".temp";
925             ifstream in;
926             m->openInputFile(filename, in);
927             
928             in >> tag; m->gobble(in);
929             
930             while(!in.eof()) {
931                 string tempName;
932                 in >> tempName; m->gobble(in);
933                 listFiles.push_back(tempName);
934             }
935             in.close();
936             m->mothurRemove((toString(processIDS[i]) + ".temp"));
937             
938             //get labels
939             filename = toString(processIDS[i]) + ".temp.labels";
940             ifstream in2;
941             m->openInputFile(filename, in2);
942             
943             float tempCutoff;
944             in2 >> tempCutoff; m->gobble(in2);
945             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
946             
947             while(!in2.eof()) {
948                 string tempName;
949                 in2 >> tempName; m->gobble(in2);
950                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
951             }
952             in2.close();
953             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
954         }
955         
956
957     #else
958        
959         //////////////////////////////////////////////////////////////////////////////////////////////////////
960                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
961                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
962                 //Taking advantage of shared memory to allow both threads to add labels.
963                 //////////////////////////////////////////////////////////////////////////////////////////////////////
964                 
965                 vector<clusterData*> pDataArray; 
966                 DWORD   dwThreadIdArray[processors-1];
967                 HANDLE  hThreadArray[processors-1]; 
968                 
969                 //Create processor worker threads.
970                 for( int i=1; i<processors; i++ ){
971                         // Allocate memory for thread data.
972                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
973                         pDataArray.push_back(tempCluster);
974                         processIDS.push_back(i);
975             
976                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
977                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
978                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
979             
980                 }
981         
982         //do your part
983         listFiles = cluster(dividedNames[0], labels);
984         
985                 //Wait until all threads have terminated.
986                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
987                 
988                 //Close all thread handles and free memory allocations.
989                 for(int i=0; i < pDataArray.size(); i++){
990             //get tag
991             tag = pDataArray[i]->tag;
992             //get listfiles created
993             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
994             //get labels
995             set<string>::iterator it;
996             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
997                         //check cutoff
998             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
999                         CloseHandle(hThreadArray[i]);
1000                         delete pDataArray[i];
1001                 }
1002
1003         #endif          
1004         
1005         return listFiles;
1006         
1007         }
1008         catch(exception& e) {
1009                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1010                 exit(1);
1011         }
1012 }
1013 //**********************************************************************************************************************
1014
1015 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1016         try {
1017                 
1018                 vector<string> listFileNames;
1019                 double smallestCutoff = cutoff;
1020                 
1021                 //cluster each distance file
1022                 for (int i = 0; i < distNames.size(); i++) {
1023             
1024             Cluster* cluster = NULL;
1025             SparseMatrix* matrix = NULL;
1026             ListVector* list = NULL;
1027             ListVector oldList;
1028             RAbundVector* rabund = NULL;
1029             
1030                         if (m->control_pressed) { return listFileNames; }
1031                         
1032                         string thisNamefile = distNames[i].begin()->second;
1033                         string thisDistFile = distNames[i].begin()->first;
1034                                                 
1035                         #ifdef USE_MPI
1036                                 int pid;
1037                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1038                                 
1039                                 //output your files too
1040                                 if (pid != 0) {
1041                                         cout << endl << "Reading " << thisDistFile << endl;
1042                                 }
1043                         #endif
1044                         
1045                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1046                         
1047                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1048                         read->setCutoff(cutoff);
1049
1050                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1051                         nameMap->readMap();
1052                         read->read(nameMap);
1053                         
1054                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
1055                         
1056                         list = read->getListVector();
1057                         oldList = *list;
1058                         matrix = read->getMatrix();
1059                         
1060                         delete read;  read = NULL;
1061                         delete nameMap; nameMap = NULL;
1062                         
1063                         
1064                         #ifdef USE_MPI
1065                                 //output your files too
1066                                 if (pid != 0) {
1067                                         cout << endl << "Clustering " << thisDistFile << endl;
1068                                 }
1069                         #endif
1070                         
1071                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1072                 
1073                         rabund = new RAbundVector(list->getRAbundVector());
1074                         
1075                         //create cluster
1076                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1077                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1078                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1079                         tag = cluster->getTag();
1080                 
1081                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1082                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1083                         
1084                         ofstream listFile;
1085                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1086                 
1087                         listFileNames.push_back(fileroot+ tag + ".list");
1088                                 
1089                         float previousDist = 0.00000;
1090                         float rndPreviousDist = 0.00000;
1091                         
1092                         oldList = *list;
1093
1094                         print_start = true;
1095                         start = time(NULL);
1096                         double saveCutoff = cutoff;
1097                 
1098                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1099                 
1100                                 if (m->control_pressed) { //clean up
1101                                         delete matrix; delete list;     delete cluster; delete rabund;
1102                                         listFile.close();
1103                                         for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1104                                         listFileNames.clear(); return listFileNames;
1105                                 }
1106                 
1107                                 cluster->update(saveCutoff);
1108         
1109                                 float dist = matrix->getSmallDist();
1110                                 float rndDist;
1111                                 if (hard) {
1112                                         rndDist = m->ceilDist(dist, precision); 
1113                                 }else{
1114                                         rndDist = m->roundDist(dist, precision); 
1115                                 }
1116
1117                                 if(previousDist <= 0.0000 && dist != previousDist){
1118                                         oldList.setLabel("unique");
1119                                         oldList.print(listFile);
1120                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1121                                 }
1122                                 else if(rndDist != rndPreviousDist){
1123                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1124                                         oldList.print(listFile);
1125                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1126                                 }
1127                 
1128                                 previousDist = dist;
1129                                 rndPreviousDist = rndDist;
1130                                 oldList = *list;
1131                         }
1132
1133                 
1134                         if(previousDist <= 0.0000){
1135                                 oldList.setLabel("unique");
1136                                 oldList.print(listFile);
1137                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1138                         }
1139                         else if(rndPreviousDist<cutoff){
1140                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1141                                 oldList.print(listFile);
1142                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1143                         }
1144         
1145                         delete matrix; delete list;     delete cluster; delete rabund; 
1146             matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1147                         listFile.close();
1148                         
1149                         if (m->control_pressed) { //clean up
1150                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1151                                 listFileNames.clear(); return listFileNames;
1152                         }
1153                         
1154                         m->mothurRemove(thisDistFile);
1155                         m->mothurRemove(thisNamefile);
1156                         
1157                         if (saveCutoff != cutoff) { 
1158                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1159                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1160                         
1161                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1162                         }
1163                         
1164                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1165                 }
1166                 
1167                 cutoff = smallestCutoff;
1168                                         
1169                 return listFileNames;
1170         
1171         }
1172         catch(exception& e) {
1173                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1174                 exit(1);
1175         }
1176
1177
1178 }
1179 //**********************************************************************************************************************
1180
1181 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1182         try{
1183                 
1184 #ifdef USE_MPI
1185                 int pid;
1186                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1187                 
1188                 if (pid != 0) {
1189 #endif
1190                 
1191                 string thisOutputDir = outputDir;
1192                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1193                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1194                 m->mothurRemove(outputFileName);
1195                 
1196                 
1197                 for (int i = 0; i < distNames.size(); i++) {
1198                         if (m->control_pressed) {  return 0; }
1199                         
1200                         string thisDistFile = distNames[i].begin()->first;
1201                         
1202                         m->appendFiles(thisDistFile, outputFileName);
1203                 }       
1204                         
1205                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1206                         
1207 #ifdef USE_MPI
1208                 }
1209 #endif
1210                                 
1211                 return 0;       
1212                 
1213                 
1214         }
1215         catch(exception& e) {
1216                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1217                 exit(1);
1218         }
1219 }
1220 //**********************************************************************************************************************