]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
19                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
20                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
21                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
22                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
23                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
24                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
25                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
26                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
27                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
28                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
29                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
30                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
31         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
34                         
35                 vector<string> myArray;
36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
37                 return myArray;
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
41                 exit(1);
42         }
43 }
44 //**********************************************************************************************************************
45 string ClusterSplitCommand::getHelpString(){    
46         try {
47                 string helpString = "";
48                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
49                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
50                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
51                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
52                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
53                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
55                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
56                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
57                 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
58                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
59                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
60                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
61                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
62                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
63                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
64                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
65         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
66 #ifdef USE_MPI
67                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
68 #endif
69                 helpString += "The cluster.split command should be in the following format: \n";
70                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
71                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
72                 return helpString;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
76                 exit(1);
77         }
78 }
79 //**********************************************************************************************************************
80 ClusterSplitCommand::ClusterSplitCommand(){     
81         try {
82                 abort = true; calledHelp = true; 
83                 setParameters();
84                 vector<string> tempOutNames;
85                 outputTypes["list"] = tempOutNames;
86                 outputTypes["rabund"] = tempOutNames;
87                 outputTypes["sabund"] = tempOutNames;
88                 outputTypes["column"] = tempOutNames;
89         }
90         catch(exception& e) {
91                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
92                 exit(1);
93         }
94 }
95 //**********************************************************************************************************************
96 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
97 ClusterSplitCommand::ClusterSplitCommand(string option)  {
98         try{
99                 abort = false; calledHelp = false;   
100                 format = "";
101                 
102                 //allow user to run help
103                 if(option == "help") { help(); abort = true; calledHelp = true; }
104                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
105                 
106                 else {
107                         vector<string> myArray = setParameters();
108                         
109                         OptionParser parser(option);
110                         map<string,string> parameters = parser.getParameters();
111                         
112                         ValidParameters validParameter("cluster.split");
113                 
114                         //check to make sure all parameters are valid for command
115                         map<string,string>::iterator it;
116                         for (it = parameters.begin(); it != parameters.end(); it++) { 
117                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
118                                         abort = true;
119                                 }
120                         }
121                         
122                         //initialize outputTypes
123                         vector<string> tempOutNames;
124                         outputTypes["list"] = tempOutNames;
125                         outputTypes["rabund"] = tempOutNames;
126                         outputTypes["sabund"] = tempOutNames;
127                         outputTypes["column"] = tempOutNames;
128                         
129                         //if the user changes the output directory command factory will send this info to us in the output parameter 
130                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
131                         
132                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
133                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
134                         if (inputDir == "not found"){   inputDir = "";          }
135                         else {
136                                 string path;
137                                 it = parameters.find("phylip");
138                                 //user has given a template file
139                                 if(it != parameters.end()){ 
140                                         path = m->hasPath(it->second);
141                                         //if the user has not given a path then, add inputdir. else leave path alone.
142                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
143                                 }
144                                 
145                                 it = parameters.find("column");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
151                                 }
152                                 
153                                 it = parameters.find("name");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
159                                 }
160                                 
161                                 it = parameters.find("taxonomy");
162                                 //user has given a template file
163                                 if(it != parameters.end()){ 
164                                         path = m->hasPath(it->second);
165                                         //if the user has not given a path then, add inputdir. else leave path alone.
166                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
167                                 }
168                                 
169                                 it = parameters.find("fasta");
170                                 //user has given a template file
171                                 if(it != parameters.end()){ 
172                                         path = m->hasPath(it->second);
173                                         //if the user has not given a path then, add inputdir. else leave path alone.
174                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
175                                 }
176                         }
177                         
178                         //check for required parameters
179                         phylipfile = validParameter.validFile(parameters, "phylip", true);
180                         if (phylipfile == "not open") { abort = true; }
181                         else if (phylipfile == "not found") { phylipfile = ""; }        
182                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
183                         
184                         columnfile = validParameter.validFile(parameters, "column", true);
185                         if (columnfile == "not open") { abort = true; } 
186                         else if (columnfile == "not found") { columnfile = ""; }
187                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
188                         
189                         namefile = validParameter.validFile(parameters, "name", true);
190                         if (namefile == "not open") { abort = true; }   
191                         else if (namefile == "not found") { namefile = "";  }
192                         else { m->setNameFile(namefile); }
193                         
194                         fastafile = validParameter.validFile(parameters, "fasta", true);
195                         if (fastafile == "not open") { abort = true; }  
196                         else if (fastafile == "not found") { fastafile = ""; }
197                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
198                         
199                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
200                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
201                         else if (taxFile == "not found") { taxFile = ""; }
202                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
203                         
204                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
205                                 //is there are current file available for either of these?
206                                 //give priority to column, then phylip, then fasta
207                                 columnfile = m->getColumnFile(); 
208                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
209                                 else { 
210                                         phylipfile = m->getPhylipFile(); 
211                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
212                                         else { 
213                                                 fastafile = m->getFastaFile(); 
214                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
215                                                 else { 
216                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
217                                                         abort = true; 
218                                                 }
219                                         }
220                                 }
221                         }
222                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
223                 
224                         if (columnfile != "") {
225                                 if (namefile == "") { 
226                                         namefile = m->getNameFile(); 
227                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
228                                         else { 
229                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
230                                                 abort = true; 
231                                         }       
232                                 }
233                         }
234                         
235                         if (fastafile != "") {
236                                 if (taxFile == "") { 
237                                         taxFile = m->getTaxonomyFile(); 
238                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
239                                         else { 
240                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
241                                                 abort = true; 
242                                         }       
243                                 }
244                                 
245                                 if (namefile == "") { 
246                                         namefile = m->getNameFile(); 
247                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
248                                         else { 
249                                                 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
250                                                 abort = true; 
251                                         }       
252                                 }
253                         }
254                                         
255                         //check for optional parameter and set defaults
256                         // ...at some point should added some additional type checking...
257                         //get user cutoff and precision or use defaults
258                         string temp;
259                         temp = validParameter.validFile(parameters, "precision", false);
260                         if (temp == "not found") { temp = "100"; }
261                         //saves precision legnth for formatting below
262                         length = temp.length();
263                         m->mothurConvert(temp, precision); 
264                         
265                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
266                         hard = m->isTrue(temp);
267                         
268                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
269                         large = m->isTrue(temp);
270             
271                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
272                         m->setProcessors(temp);
273                         m->mothurConvert(temp, processors);
274                         
275                         temp = validParameter.validFile(parameters, "splitmethod", false);      
276                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
277                                 if (temp == "not found")  { splitmethod = "distance"; }
278                                 else {  splitmethod = temp; }
279                         }
280                         
281             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
282                         classic = m->isTrue(temp);
283             
284             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
285
286                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
287                         m->mothurConvert(temp, cutoff); 
288                         cutoff += (5 / (precision * 10.0));  
289                         
290                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
291                         m->mothurConvert(temp, taxLevelCutoff); 
292                         
293                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
294                         
295                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
296                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
297                         
298                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
299                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
300                         
301                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
302
303                         showabund = validParameter.validFile(parameters, "showabund", false);
304                         if (showabund == "not found") { showabund = "T"; }
305
306                         timing = validParameter.validFile(parameters, "timing", false);
307                         if (timing == "not found") { timing = "F"; }
308                         
309                 }
310         }
311         catch(exception& e) {
312                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
313                 exit(1);
314         }
315 }
316
317 //**********************************************************************************************************************
318
319 int ClusterSplitCommand::execute(){
320         try {
321         
322                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
323                 
324                 time_t estart;
325                 vector<string> listFileNames;
326                 set<string> labels;
327                 string singletonName = "";
328                 double saveCutoff = cutoff;
329
330                 //****************** file prep work ******************************//
331                 #ifdef USE_MPI
332                         int pid;
333                         int tag = 2001;
334                         MPI_Status status; 
335                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
336                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
337                         
338                         if (pid == 0) { //only process 0 converts and splits
339                         
340                 #endif
341                 
342                 //if user gave a phylip file convert to column file
343                 if (format == "phylip") {
344                         estart = time(NULL);
345                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
346                         
347                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
348                         
349                         NameAssignment* nameMap = NULL;
350                         convert->setFormat("phylip");
351                         convert->read(nameMap);
352                         
353                         if (m->control_pressed) {  delete convert;  return 0;  }
354                         
355                         distfile = convert->getOutputFile();
356                 
357                         //if no names file given with phylip file, create it
358                         ListVector* listToMakeNameFile =  convert->getListVector();
359                         if (namefile == "") {  //you need to make a namefile for split matrix
360                                 ofstream out;
361                                 namefile = phylipfile + ".names";
362                                 m->openOutputFile(namefile, out);
363                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
364                                         string bin = listToMakeNameFile->get(i);
365                                         out << bin << '\t' << bin << endl;
366                                 }
367                                 out.close();
368                         }
369                         delete listToMakeNameFile;
370                         delete convert;
371                         
372                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
373                 }
374                 if (m->control_pressed) { return 0; }
375                 
376                 estart = time(NULL);
377                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
378                 
379                 //split matrix into non-overlapping groups
380                 SplitMatrix* split;
381                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
382                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
383                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);     }
384                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
385                 
386                 split->split();
387                 
388                 if (m->control_pressed) { delete split; return 0; }
389                 
390                 singletonName = split->getSingletonNames();
391                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
392                 delete split;
393                 
394                 //output a merged distance file
395                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
396                         
397                                 
398                 if (m->control_pressed) { return 0; }
399                 
400                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
401                 estart = time(NULL);
402                 
403                 //****************** break up files between processes and cluster each file set ******************************//
404         #ifdef USE_MPI
405                         ////you are process 0 from above////
406                         
407                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
408                         dividedNames.resize(processors);
409                                         
410                         //for each file group figure out which process will complete it
411                         //want to divide the load intelligently so the big files are spread between processes
412                         for (int i = 0; i < distName.size(); i++) { 
413                                 int processToAssign = (i+1) % processors; 
414                                 if (processToAssign == 0) { processToAssign = processors; }
415                                                 
416                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
417                         }
418                                         
419                         //not lets reverse the order of ever other process, so we balance big files running with little ones
420                         for (int i = 0; i < processors; i++) {
421                                 int remainder = ((i+1) % processors);
422                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
423                         }
424                         
425                         
426                         //send each child the list of files it needs to process
427                         for(int i = 1; i < processors; i++) { 
428                                 //send number of file pairs
429                                 int num = dividedNames[i].size();
430                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
431                                 
432                                 for (int j = 0; j < num; j++) { //send filenames to process i
433                                         char tempDistFileName[1024];
434                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
435                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
436                                         
437                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
438                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
439                                         
440                                         char tempNameFileName[1024];
441                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
442                                         int lengthName = (dividedNames[i][j].begin()->second).length();
443
444                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
445                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
446                                 }
447                         }
448                         
449                         //process your share
450                         listFileNames = cluster(dividedNames[0], labels);
451                         
452                         //receive the other processes info
453                         for(int i = 1; i < processors; i++) { 
454                                 int num = dividedNames[i].size();
455                                 
456                                 double tempCutoff;
457                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
458                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
459                                 
460                                 //send list filenames to root process
461                                 for (int j = 0; j < num; j++) {  
462                                         int lengthList = 0;
463                                         char tempListFileName[1024];
464                                 
465                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
466                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
467                                 
468                                         string myListFileName = tempListFileName;
469                                         myListFileName = myListFileName.substr(0, lengthList);
470                                         
471                                         listFileNames.push_back(myListFileName);
472                                 }
473                                 
474                                 //send Labels to root process
475                                 int numLabels = 0;
476                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
477                                 
478                                 for (int j = 0; j < numLabels; j++) {  
479                                         int lengthLabel = 0;
480                                         char tempLabel[100];
481                                 
482                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
483                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
484                                 
485                                         string myLabel = tempLabel;
486                                         myLabel = myLabel.substr(0, lengthLabel);
487                         
488                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
489                                 }
490                         }
491                         
492                 }else { //you are a child process
493                         vector < map<string, string> >  myNames;
494                         
495                         //recieve the files you need to process
496                         //receive number of file pairs
497                         int num = 0;
498                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
499                         
500                         myNames.resize(num);
501         
502                         for (int j = 0; j < num; j++) { //receive filenames to process 
503                                 int lengthDist = 0;
504                                 char tempDistFileName[1024];
505                                 
506                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
507                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
508                                 
509                                 string myDistFileName = tempDistFileName;
510                                 myDistFileName = myDistFileName.substr(0, lengthDist);
511                         
512                                 int lengthName = 0;
513                                 char tempNameFileName[1024];
514                                 
515                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
516                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
517                                 
518                                 string myNameFileName = tempNameFileName;
519                                 myNameFileName = myNameFileName.substr(0, lengthName);
520                                 
521                                 //save file name
522                                 myNames[j][myDistFileName] = myNameFileName;
523                         }
524         
525                         //process them
526                         listFileNames = cluster(myNames, labels);
527                         
528                         //send cutoff
529                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
530                         
531                         //send list filenames to root process
532                         for (int j = 0; j < num; j++) {  
533                                 char tempListFileName[1024];
534                                 strcpy(tempListFileName, listFileNames[j].c_str());
535                                 int lengthList = listFileNames[j].length();
536                                         
537                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
538                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
539                         }
540                         
541                         //send Labels to root process
542                         int numLabels = labels.size();
543                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
544                         
545                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
546                                 char tempLabel[100];
547                                 strcpy(tempLabel, (*it).c_str());
548                                 int lengthLabel = (*it).length();
549                                         
550                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
551                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
552                         }
553                 }
554                 
555                 //make everyone wait
556                 MPI_Barrier(MPI_COMM_WORLD);
557                 
558         #else
559                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
560                 //sanity check
561                 if (processors > distName.size()) { processors = distName.size(); }
562                 
563                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
564                                 if(processors == 1){
565                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
566                                 }else{
567                                         listFileNames = createProcesses(distName, labels);
568                 }
569                 #else
570                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
571                 #endif
572         #endif  
573                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
574                 
575                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
576                 
577                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
578                 
579                 //****************** merge list file and create rabund and sabund files ******************************//
580                 estart = time(NULL);
581                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
582                 
583                 #ifdef USE_MPI
584                         if (pid == 0) { //only process 0 merges
585                 #endif
586
587                 ListVector* listSingle;
588                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
589                 
590                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
591                 
592                 mergeLists(listFileNames, labelBins, listSingle);
593
594                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
595                 
596                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
597                 
598                 //set list file as new current listfile
599                 string current = "";
600                 itTypes = outputTypes.find("list");
601                 if (itTypes != outputTypes.end()) {
602                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
603                 }
604                 
605                 //set rabund file as new current rabundfile
606                 itTypes = outputTypes.find("rabund");
607                 if (itTypes != outputTypes.end()) {
608                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
609                 }
610                 
611                 //set sabund file as new current sabundfile
612                 itTypes = outputTypes.find("sabund");
613                 if (itTypes != outputTypes.end()) {
614                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
615                 }
616                                 
617                 m->mothurOutEndLine();
618                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
619                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
620                 m->mothurOutEndLine();
621                 
622                 #ifdef USE_MPI
623                         } //only process 0 merges
624                         
625                         //make everyone wait
626                         MPI_Barrier(MPI_COMM_WORLD);
627                 #endif
628
629                 return 0;
630         }
631         catch(exception& e) {
632                 m->errorOut(e, "ClusterSplitCommand", "execute");
633                 exit(1);
634         }
635 }
636 //**********************************************************************************************************************
637 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
638         try {
639                                 
640                 map<float, int> labelBin;
641                 vector<float> orderFloat;
642                 int numSingleBins;
643                 
644                 //read in singletons
645                 if (singleton != "none") {
646                         ifstream in;
647                         m->openInputFile(singleton, in);
648                                 
649                         string firstCol, secondCol;
650                         listSingle = new ListVector();
651                         while (!in.eof()) {
652                                 in >> firstCol >> secondCol; m->gobble(in);
653                                 listSingle->push_back(secondCol);
654                         }
655                         in.close();
656                         m->mothurRemove(singleton);
657                         
658                         numSingleBins = listSingle->getNumBins();
659                 }else{  listSingle = NULL; numSingleBins = 0;  }
660                 
661                 //go through users set and make them floats so we can sort them 
662                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
663                         float temp = -10.0;
664
665                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
666                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
667                         
668                         if (temp <= cutoff) {
669                                 orderFloat.push_back(temp);
670                                 labelBin[temp] = numSingleBins; //initialize numbins 
671                         }
672                 }
673         
674                 //sort order
675                 sort(orderFloat.begin(), orderFloat.end());
676                 userLabels.clear();
677                         
678                 //get the list info from each file
679                 for (int k = 0; k < listNames.size(); k++) {
680         
681                         if (m->control_pressed) {  
682                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
683                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
684                                 return labelBin;
685                         }
686                         
687                         InputData* input = new InputData(listNames[k], "list");
688                         ListVector* list = input->getListVector();
689                         string lastLabel = list->getLabel();
690                         
691                         string filledInList = listNames[k] + "filledInTemp";
692                         ofstream outFilled;
693                         m->openOutputFile(filledInList, outFilled);
694         
695                         //for each label needed
696                         for(int l = 0; l < orderFloat.size(); l++){
697                         
698                                 string thisLabel;
699                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
700                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
701
702                                 //this file has reached the end
703                                 if (list == NULL) { 
704                                         list = input->getListVector(lastLabel, true); 
705                                 }else{  //do you have the distance, or do you need to fill in
706                                                 
707                                         float labelFloat;
708                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
709                                         else { convert(list->getLabel(), labelFloat); }
710
711                                         //check for missing labels
712                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
713                                                 //if its bigger get last label, otherwise keep it
714                                                 delete list;
715                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
716                                         }
717                                         lastLabel = list->getLabel();
718                                 }
719                                 
720                                 //print to new file
721                                 list->setLabel(thisLabel);
722                                 list->print(outFilled);
723                 
724                                 //update labelBin
725                                 labelBin[orderFloat[l]] += list->getNumBins();
726                                                                         
727                                 delete list;
728                                                                         
729                                 list = input->getListVector();
730                         }
731                         
732                         if (list != NULL) { delete list; }
733                         delete input;
734                         
735                         outFilled.close();
736                         m->mothurRemove(listNames[k]);
737                         rename(filledInList.c_str(), listNames[k].c_str());
738                 }
739                 
740                 return labelBin;
741         }
742         catch(exception& e) {
743                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
744                 exit(1);
745         }
746 }
747 //**********************************************************************************************************************
748 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
749         try {
750                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
751                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
752                 
753                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
754                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
755                 m->openOutputFile(fileroot+ tag + ".list",              outList);
756                                 
757                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
758                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
759                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
760                 
761                 map<float, int>::iterator itLabel;
762
763                 //for each label needed
764                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
765                         
766                         string thisLabel;
767                         if (itLabel->first == -1) { thisLabel = "unique"; }
768                         else { thisLabel = toString(itLabel->first,  length-1);  } 
769                         
770                         outList << thisLabel << '\t' << itLabel->second << '\t';
771
772                         RAbundVector* rabund = new RAbundVector();
773                         rabund->setLabel(thisLabel);
774
775                         //add in singletons
776                         if (listSingle != NULL) {
777                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
778                                         outList << listSingle->get(j) << '\t';
779                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
780                                 }
781                         }
782                         
783                         //get the list info from each file
784                         for (int k = 0; k < listNames.size(); k++) {
785         
786                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
787                                 
788                                 InputData* input = new InputData(listNames[k], "list");
789                                 ListVector* list = input->getListVector(thisLabel);
790                                 
791                                 //this file has reached the end
792                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
793                                 else {          
794                                         for (int j = 0; j < list->getNumBins(); j++) {
795                                                 outList << list->get(j) << '\t';
796                                                 rabund->push_back(m->getNumNames(list->get(j)));
797                                         }
798                                         delete list;
799                                 }
800                                 delete input;
801                         }
802                         
803                         SAbundVector sabund = rabund->getSAbundVector();
804                         
805                         sabund.print(outSabund);
806                         rabund->print(outRabund);
807                         outList << endl;
808                         
809                         delete rabund;
810                 }
811                 
812                 outList.close();
813                 outRabund.close();
814                 outSabund.close();
815                 
816                 if (listSingle != NULL) { delete listSingle;  }
817                 
818                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
819                 
820                 return 0;
821         }
822         catch(exception& e) {
823                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
824                 exit(1);
825         }
826 }
827
828 //**********************************************************************************************************************
829
830 void ClusterSplitCommand::printData(ListVector* oldList){
831         try {
832                 string label = oldList->getLabel();
833                 RAbundVector oldRAbund = oldList->getRAbundVector();
834                 
835                 oldRAbund.setLabel(label);
836                 if (m->isTrue(showabund)) {
837                         oldRAbund.getSAbundVector().print(cout);
838                 }
839                 oldRAbund.print(outRabund);
840                 oldRAbund.getSAbundVector().print(outSabund);
841         
842                 oldList->print(outList);
843         }
844         catch(exception& e) {
845                 m->errorOut(e, "ClusterSplitCommand", "printData");
846                 exit(1);
847         }
848 }
849 //**********************************************************************************************************************
850 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
851         try {
852         
853         vector<string> listFiles;
854         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
855         dividedNames.resize(processors);
856         
857         //for each file group figure out which process will complete it
858         //want to divide the load intelligently so the big files are spread between processes
859         for (int i = 0; i < distName.size(); i++) { 
860             //cout << i << endl;
861             int processToAssign = (i+1) % processors; 
862             if (processToAssign == 0) { processToAssign = processors; }
863             
864             dividedNames[(processToAssign-1)].push_back(distName[i]);
865             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
866         }
867         
868         //not lets reverse the order of ever other process, so we balance big files running with little ones
869         for (int i = 0; i < processors; i++) {
870             //cout << i << endl;
871             int remainder = ((i+1) % processors);
872             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
873         }
874         
875         if (m->control_pressed) { return listFiles; }
876         
877         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
878                 int process = 1;
879                 processIDS.clear();
880                 
881                 //loop through and create all the processes you want
882                 while (process != processors) {
883                         int pid = fork();
884                         
885                         if (pid > 0) {
886                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
887                                 process++;
888                         }else if (pid == 0){
889                                 set<string> labels;
890                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
891                                 
892                                 //write out names to file
893                                 string filename = toString(getpid()) + ".temp";
894                                 ofstream out;
895                                 m->openOutputFile(filename, out);
896                                 out << tag << endl;
897                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
898                                 out.close();
899                                 
900                                 //print out labels
901                                 ofstream outLabels;
902                                 filename = toString(getpid()) + ".temp.labels";
903                                 m->openOutputFile(filename, outLabels);
904                                 
905                                 outLabels << cutoff << endl;
906                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
907                                         outLabels << (*it) << endl;
908                                 }
909                                 outLabels.close();
910
911                                 exit(0);
912                         }else { 
913                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
914                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
915                                 exit(0);
916                         }
917                 }
918                 
919         //do your part
920         listFiles = cluster(dividedNames[0], labels);
921         
922                 //force parent to wait until all the processes are done
923                 for (int i=0;i< processIDS.size();i++) { 
924                         int temp = processIDS[i];
925                         wait(&temp);
926                 }
927         
928         //get list of list file names from each process
929         for(int i=0;i<processIDS.size();i++){
930             string filename = toString(processIDS[i]) + ".temp";
931             ifstream in;
932             m->openInputFile(filename, in);
933             
934             in >> tag; m->gobble(in);
935             
936             while(!in.eof()) {
937                 string tempName;
938                 in >> tempName; m->gobble(in);
939                 listFiles.push_back(tempName);
940             }
941             in.close();
942             m->mothurRemove((toString(processIDS[i]) + ".temp"));
943             
944             //get labels
945             filename = toString(processIDS[i]) + ".temp.labels";
946             ifstream in2;
947             m->openInputFile(filename, in2);
948             
949             float tempCutoff;
950             in2 >> tempCutoff; m->gobble(in2);
951             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
952             
953             while(!in2.eof()) {
954                 string tempName;
955                 in2 >> tempName; m->gobble(in2);
956                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
957             }
958             in2.close();
959             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
960         }
961         
962
963     #else
964        
965         //////////////////////////////////////////////////////////////////////////////////////////////////////
966                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
967                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
968                 //Taking advantage of shared memory to allow both threads to add labels.
969                 //////////////////////////////////////////////////////////////////////////////////////////////////////
970                 
971                 vector<clusterData*> pDataArray; 
972                 DWORD   dwThreadIdArray[processors-1];
973                 HANDLE  hThreadArray[processors-1]; 
974                 
975                 //Create processor worker threads.
976                 for( int i=1; i<processors; i++ ){
977                         // Allocate memory for thread data.
978                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
979                         pDataArray.push_back(tempCluster);
980                         processIDS.push_back(i);
981             
982                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
983                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
984                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
985             
986                 }
987         
988         //do your part
989         listFiles = cluster(dividedNames[0], labels);
990         
991                 //Wait until all threads have terminated.
992                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
993                 
994                 //Close all thread handles and free memory allocations.
995                 for(int i=0; i < pDataArray.size(); i++){
996             //get tag
997             tag = pDataArray[i]->tag;
998             //get listfiles created
999             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1000             //get labels
1001             set<string>::iterator it;
1002             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1003                         //check cutoff
1004             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1005                         CloseHandle(hThreadArray[i]);
1006                         delete pDataArray[i];
1007                 }
1008
1009         #endif          
1010         
1011         return listFiles;
1012         
1013         }
1014         catch(exception& e) {
1015                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1016                 exit(1);
1017         }
1018 }
1019 //**********************************************************************************************************************
1020
1021 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1022         try {
1023                 
1024                 vector<string> listFileNames;
1025                 double smallestCutoff = cutoff;
1026                 
1027                 //cluster each distance file
1028                 for (int i = 0; i < distNames.size(); i++) {
1029             
1030                         string thisNamefile = distNames[i].begin()->second;
1031                         string thisDistFile = distNames[i].begin()->first;
1032                         
1033                         string listFileName = "";
1034             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1035             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1036
1037                         if (m->control_pressed) { //clean up
1038                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1039                                 listFileNames.clear(); return listFileNames;
1040                         }
1041             
1042             listFileNames.push_back(listFileName);
1043         }
1044                 
1045                 cutoff = smallestCutoff;
1046                                         
1047                 return listFileNames;
1048         
1049         }
1050         catch(exception& e) {
1051                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1052                 exit(1);
1053         }
1054
1055
1056 }
1057 //**********************************************************************************************************************
1058 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1059         try {
1060         string listFileName = "";
1061         
1062         ListVector* list = NULL;
1063         ListVector oldList;
1064         RAbundVector* rabund = NULL;
1065         
1066 #ifdef USE_MPI
1067         int pid;
1068         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1069         
1070         //output your files too
1071         if (pid != 0) {
1072             cout << endl << "Reading " << thisDistFile << endl;
1073         }
1074 #endif
1075
1076         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1077         
1078         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1079         nameMap->readMap();
1080                                 
1081                 //reads phylip file storing data in 2D vector, also fills list and rabund
1082         bool sim = false;
1083                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1084                 cluster->readPhylipFile(thisDistFile, nameMap);
1085                 tag = cluster->getTag();
1086         
1087                 if (m->control_pressed) { delete cluster; return 0; }
1088                 
1089                 list = cluster->getListVector();
1090                 rabund = cluster->getRAbundVector();
1091         
1092                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1093                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1094         listFileName = fileroot+ tag + ".list";
1095         
1096         ofstream listFile;
1097                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1098                 
1099                 float previousDist = 0.00000;
1100                 float rndPreviousDist = 0.00000;
1101                 oldList = *list;
1102                 
1103 #ifdef USE_MPI
1104         //output your files too
1105         if (pid != 0) {
1106             cout << endl << "Clustering " << thisDistFile << endl;
1107         }
1108 #endif
1109
1110         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1111         
1112                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1113                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  return listFileName;  }
1114             
1115                         cluster->update(cutoff);
1116             
1117                         float dist = cluster->getSmallDist();
1118                         float rndDist;
1119                         if (hard) {
1120                                 rndDist = m->ceilDist(dist, precision); 
1121                         }else{
1122                                 rndDist = m->roundDist(dist, precision); 
1123                         }
1124             
1125             if(previousDist <= 0.0000 && dist != previousDist){
1126                 oldList.setLabel("unique");
1127                 oldList.print(listFile);
1128                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1129             }
1130             else if(rndDist != rndPreviousDist){
1131                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1132                 oldList.print(listFile);
1133                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1134             }
1135
1136             
1137                         previousDist = dist;
1138                         rndPreviousDist = rndDist;
1139                         oldList = *list;
1140                 }
1141         
1142                 if(previousDist <= 0.0000){
1143             oldList.setLabel("unique");
1144             oldList.print(listFile);
1145             if (labels.count("unique") == 0) { labels.insert("unique"); }
1146         }
1147         else if(rndPreviousDist<cutoff){
1148             oldList.setLabel(toString(rndPreviousDist,  length-1));
1149             oldList.print(listFile);
1150             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1151         }
1152
1153         
1154                 listFile.close();
1155                 
1156                 delete cluster; delete nameMap; delete list; delete rabund;
1157
1158         
1159         return listFileName;
1160         
1161         }
1162         catch(exception& e) {
1163                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1164                 exit(1);
1165         }
1166 }
1167
1168 //**********************************************************************************************************************
1169 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1170         try {
1171         string listFileName = "";
1172         
1173         Cluster* cluster = NULL;
1174         SparseMatrix* matrix = NULL;
1175         ListVector* list = NULL;
1176         ListVector oldList;
1177         RAbundVector* rabund = NULL;
1178         
1179         if (m->control_pressed) { return listFileName; }
1180         
1181 #ifdef USE_MPI
1182         int pid;
1183         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1184         
1185         //output your files too
1186         if (pid != 0) {
1187             cout << endl << "Reading " << thisDistFile << endl;
1188         }
1189 #endif
1190         
1191         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1192         
1193         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1194         read->setCutoff(cutoff);
1195         
1196         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1197         nameMap->readMap();
1198         read->read(nameMap);
1199         
1200         if (m->control_pressed) {  delete read; delete nameMap; return listFileName; }
1201         
1202         list = read->getListVector();
1203         oldList = *list;
1204         matrix = read->getMatrix();
1205         
1206         delete read;  read = NULL;
1207         delete nameMap; nameMap = NULL;
1208         
1209         
1210 #ifdef USE_MPI
1211         //output your files too
1212         if (pid != 0) {
1213             cout << endl << "Clustering " << thisDistFile << endl;
1214         }
1215 #endif
1216         
1217         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1218                 
1219         rabund = new RAbundVector(list->getRAbundVector());
1220         
1221         //create cluster
1222         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1223         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1224         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1225         tag = cluster->getTag();
1226                 
1227         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1228         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1229         
1230         ofstream listFile;
1231         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1232                 
1233         listFileName = fileroot+ tag + ".list";
1234         
1235         float previousDist = 0.00000;
1236         float rndPreviousDist = 0.00000;
1237         
1238         oldList = *list;
1239         
1240         print_start = true;
1241         start = time(NULL);
1242         double saveCutoff = cutoff;
1243                 
1244         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1245             
1246             if (m->control_pressed) { //clean up
1247                 delete matrix; delete list;     delete cluster; delete rabund;
1248                 listFile.close();
1249                 m->mothurRemove(listFileName);  
1250                 return listFileName;
1251             }
1252             
1253             cluster->update(saveCutoff);
1254             
1255             float dist = matrix->getSmallDist();
1256             float rndDist;
1257             if (hard) {
1258                 rndDist = m->ceilDist(dist, precision); 
1259             }else{
1260                 rndDist = m->roundDist(dist, precision); 
1261             }
1262             
1263             if(previousDist <= 0.0000 && dist != previousDist){
1264                 oldList.setLabel("unique");
1265                 oldList.print(listFile);
1266                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1267             }
1268             else if(rndDist != rndPreviousDist){
1269                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1270                 oldList.print(listFile);
1271                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1272             }
1273             
1274             previousDist = dist;
1275             rndPreviousDist = rndDist;
1276             oldList = *list;
1277         }
1278         
1279                 
1280         if(previousDist <= 0.0000){
1281             oldList.setLabel("unique");
1282             oldList.print(listFile);
1283             if (labels.count("unique") == 0) { labels.insert("unique"); }
1284         }
1285         else if(rndPreviousDist<cutoff){
1286             oldList.setLabel(toString(rndPreviousDist,  length-1));
1287             oldList.print(listFile);
1288             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1289         }
1290         
1291         delete matrix; delete list;     delete cluster; delete rabund; 
1292         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1293         listFile.close();
1294         
1295         if (m->control_pressed) { //clean up
1296             m->mothurRemove(listFileName);      
1297             return listFileName;
1298         }
1299         
1300         m->mothurRemove(thisDistFile);
1301         m->mothurRemove(thisNamefile);
1302         
1303         if (saveCutoff != cutoff) { 
1304             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1305             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1306                         
1307             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1308         }
1309         
1310         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1311         
1312         return listFileName;
1313         
1314         }
1315         catch(exception& e) {
1316                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1317                 exit(1);
1318         }
1319 }
1320 //**********************************************************************************************************************
1321
1322 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1323         try{
1324                 
1325 #ifdef USE_MPI
1326                 int pid;
1327                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1328                 
1329                 if (pid != 0) {
1330 #endif
1331                 
1332                 string thisOutputDir = outputDir;
1333                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1334                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1335                 m->mothurRemove(outputFileName);
1336                 
1337                 
1338                 for (int i = 0; i < distNames.size(); i++) {
1339                         if (m->control_pressed) {  return 0; }
1340                         
1341                         string thisDistFile = distNames[i].begin()->first;
1342                         
1343                         m->appendFiles(thisDistFile, outputFileName);
1344                 }       
1345                         
1346                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1347                         
1348 #ifdef USE_MPI
1349                 }
1350 #endif
1351                                 
1352                 return 0;       
1353                 
1354                 
1355         }
1356         catch(exception& e) {
1357                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1358                 exit(1);
1359         }
1360 }
1361 //**********************************************************************************************************************