]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
Merge remote-tracking branch 'origin/master'
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
36                         
37                 vector<string> myArray;
38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
39                 return myArray;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
43                 exit(1);
44         }
45 }
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){    
48         try {
49                 string helpString = "";
50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59                 helpString += "The name parameter allows you to enter your name file. \n";
60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
70 #ifdef USE_MPI
71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
72 #endif
73                 helpString += "The cluster.split command should be in the following format: \n";
74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
76                 return helpString;
77         }
78         catch(exception& e) {
79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
80                 exit(1);
81         }
82 }
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
85     try {
86         string pattern = "";
87         
88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
91         else if (type == "column") {  pattern = "[filename],dist"; }
92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
93         
94         return pattern;
95     }
96     catch(exception& e) {
97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
98         exit(1);
99     }
100 }
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){     
103         try {
104                 abort = true; calledHelp = true; 
105                 setParameters();
106                 vector<string> tempOutNames;
107                 outputTypes["list"] = tempOutNames;
108                 outputTypes["rabund"] = tempOutNames;
109                 outputTypes["sabund"] = tempOutNames;
110                 outputTypes["column"] = tempOutNames;
111         }
112         catch(exception& e) {
113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
114                 exit(1);
115         }
116 }
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
120         try{
121                 abort = false; calledHelp = false;   
122                 format = "";
123                 
124                 //allow user to run help
125                 if(option == "help") { help(); abort = true; calledHelp = true; }
126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
127                 
128                 else {
129                         vector<string> myArray = setParameters();
130                         
131                         OptionParser parser(option);
132                         map<string,string> parameters = parser.getParameters();
133                         
134                         ValidParameters validParameter("cluster.split");
135                 
136                         //check to make sure all parameters are valid for command
137                         map<string,string>::iterator it;
138                         for (it = parameters.begin(); it != parameters.end(); it++) { 
139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
140                                         abort = true;
141                                 }
142                         }
143                         
144                         //initialize outputTypes
145                         vector<string> tempOutNames;
146                         outputTypes["list"] = tempOutNames;
147                         outputTypes["rabund"] = tempOutNames;
148                         outputTypes["sabund"] = tempOutNames;
149                         outputTypes["column"] = tempOutNames;
150                         
151                         //if the user changes the output directory command factory will send this info to us in the output parameter 
152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
153                         
154                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
156                         if (inputDir == "not found"){   inputDir = "";          }
157                         else {
158                                 string path;
159                                 it = parameters.find("phylip");
160                                 //user has given a template file
161                                 if(it != parameters.end()){ 
162                                         path = m->hasPath(it->second);
163                                         //if the user has not given a path then, add inputdir. else leave path alone.
164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
165                                 }
166                                 
167                                 it = parameters.find("column");
168                                 //user has given a template file
169                                 if(it != parameters.end()){ 
170                                         path = m->hasPath(it->second);
171                                         //if the user has not given a path then, add inputdir. else leave path alone.
172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
173                                 }
174                                 
175                                 it = parameters.find("name");
176                                 //user has given a template file
177                                 if(it != parameters.end()){ 
178                                         path = m->hasPath(it->second);
179                                         //if the user has not given a path then, add inputdir. else leave path alone.
180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
181                                 }
182                                 
183                                 it = parameters.find("taxonomy");
184                                 //user has given a template file
185                                 if(it != parameters.end()){ 
186                                         path = m->hasPath(it->second);
187                                         //if the user has not given a path then, add inputdir. else leave path alone.
188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
189                                 }
190                                 
191                                 it = parameters.find("fasta");
192                                 //user has given a template file
193                                 if(it != parameters.end()){ 
194                                         path = m->hasPath(it->second);
195                                         //if the user has not given a path then, add inputdir. else leave path alone.
196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
197                                 }
198                 
199                 it = parameters.find("count");
200                                 //user has given a template file
201                                 if(it != parameters.end()){ 
202                                         path = m->hasPath(it->second);
203                                         //if the user has not given a path then, add inputdir. else leave path alone.
204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
205                                 }
206                         }
207                         
208                         //check for required parameters
209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
210                         if (phylipfile == "not open") { abort = true; }
211                         else if (phylipfile == "not found") { phylipfile = ""; }        
212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
213                         
214                         columnfile = validParameter.validFile(parameters, "column", true);
215                         if (columnfile == "not open") { abort = true; } 
216                         else if (columnfile == "not found") { columnfile = ""; }
217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
218                         
219                         namefile = validParameter.validFile(parameters, "name", true);
220                         if (namefile == "not open") { abort = true; namefile = "";}     
221                         else if (namefile == "not found") { namefile = "";  }
222                         else { m->setNameFile(namefile); }
223             
224             countfile = validParameter.validFile(parameters, "count", true);
225                         if (countfile == "not open") { abort = true; countfile = "";}   
226                         else if (countfile == "not found") { countfile = "";  }
227                         else { m->setCountTableFile(countfile); }
228                         
229                         fastafile = validParameter.validFile(parameters, "fasta", true);
230                         if (fastafile == "not open") { abort = true; }  
231                         else if (fastafile == "not found") { fastafile = ""; }
232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
233                         
234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
235                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
236                         else if (taxFile == "not found") { taxFile = ""; }
237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
238                         
239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
240                                 //is there are current file available for either of these?
241                                 //give priority to column, then phylip, then fasta
242                                 columnfile = m->getColumnFile(); 
243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
244                                 else { 
245                                         phylipfile = m->getPhylipFile(); 
246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
247                                         else { 
248                                                 fastafile = m->getFastaFile(); 
249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
250                                                 else { 
251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
252                                                         abort = true; 
253                                                 }
254                                         }
255                                 }
256                         }
257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
258             
259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
260             
261                         if (columnfile != "") {
262                                 if ((namefile == "") && (countfile == "")) { 
263                                         namefile = m->getNameFile(); 
264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
265                                         else { 
266                                                 countfile = m->getCountTableFile();
267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
268                         else { 
269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
270                             abort = true; 
271                         }       
272                                         }       
273                                 }
274                         }
275                         
276                         if (fastafile != "") {
277                                 if (taxFile == "") { 
278                                         taxFile = m->getTaxonomyFile(); 
279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
280                                         else { 
281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
282                                                 abort = true; 
283                                         }       
284                                 }
285                                 
286                                 if ((namefile == "") && (countfile == "")) { 
287                                         namefile = m->getNameFile(); 
288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
289                                         else { 
290                                                 countfile = m->getCountTableFile();
291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
292                         else { 
293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); 
294                             abort = true; 
295                         }       
296                                         }       
297                                 }
298                         }
299                                         
300                         //check for optional parameter and set defaults
301                         // ...at some point should added some additional type checking...
302                         //get user cutoff and precision or use defaults
303                         string temp;
304                         temp = validParameter.validFile(parameters, "precision", false);
305                         if (temp == "not found") { temp = "100"; }
306                         //saves precision legnth for formatting below
307                         length = temp.length();
308                         m->mothurConvert(temp, precision); 
309                         
310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
311                         hard = m->isTrue(temp);
312                         
313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
314                         large = m->isTrue(temp);
315             
316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
317                         m->setProcessors(temp);
318                         m->mothurConvert(temp, processors);
319                         
320                         temp = validParameter.validFile(parameters, "splitmethod", false);      
321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322                                 if (temp == "not found")  { splitmethod = "distance"; }
323                                 else {  splitmethod = temp; }
324                         }
325                         
326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
327                         classic = m->isTrue(temp);
328             
329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
330
331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
332                         m->mothurConvert(temp, cutoff); 
333                         cutoff += (5 / (precision * 10.0));  
334                         
335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
336                         m->mothurConvert(temp, taxLevelCutoff); 
337                         
338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
339                         
340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
342                         
343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
345                         
346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
347
348                         showabund = validParameter.validFile(parameters, "showabund", false);
349                         if (showabund == "not found") { showabund = "T"; }
350             
351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
352             runCluster = m->isTrue(temp);
353
354                         timing = validParameter.validFile(parameters, "timing", false);
355                         if (timing == "not found") { timing = "F"; }
356                         
357                 }
358         }
359         catch(exception& e) {
360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
361                 exit(1);
362         }
363 }
364
365 //**********************************************************************************************************************
366
367 int ClusterSplitCommand::execute(){
368         try {
369         
370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
371                 
372                 time_t estart;
373                 vector<string> listFileNames;
374                 set<string> labels;
375                 string singletonName = "";
376                 double saveCutoff = cutoff;
377
378                 //****************** file prep work ******************************//
379                 #ifdef USE_MPI
380                         int pid;
381                         int tag = 2001;
382                         MPI_Status status; 
383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
385                         
386                         if (pid == 0) { //only process 0 converts and splits
387                         
388                 #endif
389                 
390                 //if user gave a phylip file convert to column file
391                 if (format == "phylip") {
392                         estart = time(NULL);
393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
394                         
395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
396                         
397                         NameAssignment* nameMap = NULL;
398                         convert->setFormat("phylip");
399                         convert->read(nameMap);
400                         
401                         if (m->control_pressed) {  delete convert;  return 0;  }
402                         
403                         distfile = convert->getOutputFile();
404                 
405                         //if no names file given with phylip file, create it
406                         ListVector* listToMakeNameFile =  convert->getListVector();
407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
408                                 ofstream out;
409                                 namefile = phylipfile + ".names";
410                                 m->openOutputFile(namefile, out);
411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412                                         string bin = listToMakeNameFile->get(i);
413                                         out << bin << '\t' << bin << endl;
414                                 }
415                                 out.close();
416                         }
417                         delete listToMakeNameFile;
418                         delete convert;
419                         
420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
421                 }
422                 if (m->control_pressed) { return 0; }
423                 
424                 estart = time(NULL);
425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
426                 
427                 //split matrix into non-overlapping groups
428                 SplitMatrix* split;
429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
433                 
434                 split->split();
435                 
436                 if (m->control_pressed) { delete split; return 0; }
437                 
438                 singletonName = split->getSingletonNames();
439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
440                 delete split;
441                 
442                 //output a merged distance file
443                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
444                         
445                                 
446                 if (m->control_pressed) { return 0; }
447                 
448                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
449                 estart = time(NULL);
450                 
451         if (!runCluster) { 
452 #ifdef USE_MPI
453     }
454 #endif  
455                 m->mothurOutEndLine();
456                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
457                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
458                 m->mothurOutEndLine();
459                 return 0;
460                 
461         }
462                 
463                 //****************** break up files between processes and cluster each file set ******************************//
464         #ifdef USE_MPI
465                         ////you are process 0 from above////
466                         
467                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
468                         dividedNames.resize(processors);
469                                         
470                         //for each file group figure out which process will complete it
471                         //want to divide the load intelligently so the big files are spread between processes
472                         for (int i = 0; i < distName.size(); i++) { 
473                                 int processToAssign = (i+1) % processors; 
474                                 if (processToAssign == 0) { processToAssign = processors; }
475                                                 
476                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
477                         }
478                                         
479                         //not lets reverse the order of ever other process, so we balance big files running with little ones
480                         for (int i = 0; i < processors; i++) {
481                                 int remainder = ((i+1) % processors);
482                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
483                         }
484                         
485                         
486                         //send each child the list of files it needs to process
487                         for(int i = 1; i < processors; i++) { 
488                                 //send number of file pairs
489                                 int num = dividedNames[i].size();
490                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
491                                 
492                                 for (int j = 0; j < num; j++) { //send filenames to process i
493                                         char tempDistFileName[1024];
494                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
495                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
496                                         
497                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
498                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
499                                         
500                                         char tempNameFileName[1024];
501                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
502                                         int lengthName = (dividedNames[i][j].begin()->second).length();
503
504                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
505                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
506                                 }
507                         }
508                         
509                         //process your share
510                         listFileNames = cluster(dividedNames[0], labels);
511                         
512                         //receive the other processes info
513                         for(int i = 1; i < processors; i++) { 
514                                 int num = dividedNames[i].size();
515                                 
516                                 double tempCutoff;
517                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
518                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
519                                 
520                                 //send list filenames to root process
521                                 for (int j = 0; j < num; j++) {  
522                                         int lengthList = 0;
523                                         char tempListFileName[1024];
524                                 
525                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
526                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
527                                 
528                                         string myListFileName = tempListFileName;
529                                         myListFileName = myListFileName.substr(0, lengthList);
530                                         
531                                         listFileNames.push_back(myListFileName);
532                                 }
533                                 
534                                 //send Labels to root process
535                                 int numLabels = 0;
536                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
537                                 
538                                 for (int j = 0; j < numLabels; j++) {  
539                                         int lengthLabel = 0;
540                                         char tempLabel[100];
541                                 
542                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
543                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
544                                 
545                                         string myLabel = tempLabel;
546                                         myLabel = myLabel.substr(0, lengthLabel);
547                         
548                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
549                                 }
550                         }
551                         
552                 }else { //you are a child process
553                         vector < map<string, string> >  myNames;
554                         
555                         //recieve the files you need to process
556                         //receive number of file pairs
557                         int num = 0;
558                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
559                         
560                         myNames.resize(num);
561         
562                         for (int j = 0; j < num; j++) { //receive filenames to process 
563                                 int lengthDist = 0;
564                                 char tempDistFileName[1024];
565                                 
566                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
567                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
568                                 
569                                 string myDistFileName = tempDistFileName;
570                                 myDistFileName = myDistFileName.substr(0, lengthDist);
571                         
572                                 int lengthName = 0;
573                                 char tempNameFileName[1024];
574                                 
575                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
576                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
577                                 
578                                 string myNameFileName = tempNameFileName;
579                                 myNameFileName = myNameFileName.substr(0, lengthName);
580                                 
581                                 //save file name
582                                 myNames[j][myDistFileName] = myNameFileName;
583                         }
584         
585                         //process them
586                         listFileNames = cluster(myNames, labels);
587                         
588                         //send cutoff
589                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
590                         
591                         //send list filenames to root process
592                         for (int j = 0; j < num; j++) {  
593                                 char tempListFileName[1024];
594                                 strcpy(tempListFileName, listFileNames[j].c_str());
595                                 int lengthList = listFileNames[j].length();
596                                         
597                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
598                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
599                         }
600                         
601                         //send Labels to root process
602                         int numLabels = labels.size();
603                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
604                         
605                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
606                                 char tempLabel[100];
607                                 strcpy(tempLabel, (*it).c_str());
608                                 int lengthLabel = (*it).length();
609                                         
610                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
611                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
612                         }
613                 }
614                 
615                 //make everyone wait
616                 MPI_Barrier(MPI_COMM_WORLD);
617                 
618         #else
619                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
620                 //sanity check
621                 if (processors > distName.size()) { processors = distName.size(); }
622                 
623                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
624                                 if(processors == 1){
625                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
626                                 }else{
627                                         listFileNames = createProcesses(distName, labels);
628                 }
629                 #else
630                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
631                 #endif
632         #endif  
633                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
634                 
635                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
636                 
637                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
638                 
639                 //****************** merge list file and create rabund and sabund files ******************************//
640                 estart = time(NULL);
641                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
642                 
643                 #ifdef USE_MPI
644                         if (pid == 0) { //only process 0 merges
645                 #endif
646
647                 ListVector* listSingle;
648                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
649                 
650                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
651                 
652                 mergeLists(listFileNames, labelBins, listSingle);
653
654                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
655                 
656                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
657                 
658                 //set list file as new current listfile
659                 string current = "";
660                 itTypes = outputTypes.find("list");
661                 if (itTypes != outputTypes.end()) {
662                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
663                 }
664                 
665                 //set rabund file as new current rabundfile
666                 itTypes = outputTypes.find("rabund");
667                 if (itTypes != outputTypes.end()) {
668                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
669                 }
670                 
671                 //set sabund file as new current sabundfile
672                 itTypes = outputTypes.find("sabund");
673                 if (itTypes != outputTypes.end()) {
674                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
675                 }
676                                 
677                 m->mothurOutEndLine();
678                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
679                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
680                 m->mothurOutEndLine();
681                 
682                 #ifdef USE_MPI
683                         } //only process 0 merges
684                         
685                         //make everyone wait
686                         MPI_Barrier(MPI_COMM_WORLD);
687                 #endif
688
689                 return 0;
690         }
691         catch(exception& e) {
692                 m->errorOut(e, "ClusterSplitCommand", "execute");
693                 exit(1);
694         }
695 }
696 //**********************************************************************************************************************
697 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
698         try {
699                                 
700                 map<float, int> labelBin;
701                 vector<float> orderFloat;
702                 int numSingleBins;
703                 
704                 //read in singletons
705                 if (singleton != "none") {
706             
707             ifstream in;
708             m->openInputFile(singleton, in);
709                                 
710                         string firstCol, secondCol;
711                         listSingle = new ListVector();
712             
713             if (countfile != "") { m->getline(in); m->gobble(in); }
714             
715                         while (!in.eof()) {
716                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
717                                 if (countfile == "") { listSingle->push_back(secondCol); }
718                 else { listSingle->push_back(firstCol); }
719                         }
720             
721                         in.close();
722                         m->mothurRemove(singleton);
723                         
724                         numSingleBins = listSingle->getNumBins();
725                 }else{  listSingle = NULL; numSingleBins = 0;  }
726                 
727                 //go through users set and make them floats so we can sort them 
728                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
729                         float temp = -10.0;
730
731                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
732                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
733                         
734                         if (temp <= cutoff) {
735                                 orderFloat.push_back(temp);
736                                 labelBin[temp] = numSingleBins; //initialize numbins 
737                         }
738                 }
739         
740                 //sort order
741                 sort(orderFloat.begin(), orderFloat.end());
742                 userLabels.clear();
743                         
744                 //get the list info from each file
745                 for (int k = 0; k < listNames.size(); k++) {
746         
747                         if (m->control_pressed) {  
748                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
749                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
750                                 return labelBin;
751                         }
752                         
753                         InputData* input = new InputData(listNames[k], "list");
754                         ListVector* list = input->getListVector();
755                         string lastLabel = list->getLabel();
756                         
757                         string filledInList = listNames[k] + "filledInTemp";
758                         ofstream outFilled;
759                         m->openOutputFile(filledInList, outFilled);
760         
761                         //for each label needed
762                         for(int l = 0; l < orderFloat.size(); l++){
763                         
764                                 string thisLabel;
765                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
766                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
767
768                                 //this file has reached the end
769                                 if (list == NULL) { 
770                                         list = input->getListVector(lastLabel, true); 
771                                 }else{  //do you have the distance, or do you need to fill in
772                                                 
773                                         float labelFloat;
774                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
775                                         else { convert(list->getLabel(), labelFloat); }
776
777                                         //check for missing labels
778                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
779                                                 //if its bigger get last label, otherwise keep it
780                                                 delete list;
781                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
782                                         }
783                                         lastLabel = list->getLabel();
784                                 }
785                                 
786                                 //print to new file
787                                 list->setLabel(thisLabel);
788                                 list->print(outFilled);
789                 
790                                 //update labelBin
791                                 labelBin[orderFloat[l]] += list->getNumBins();
792                                                                         
793                                 delete list;
794                                                                         
795                                 list = input->getListVector();
796                         }
797                         
798                         if (list != NULL) { delete list; }
799                         delete input;
800                         
801                         outFilled.close();
802                         m->mothurRemove(listNames[k]);
803                         rename(filledInList.c_str(), listNames[k].c_str());
804                 }
805                 
806                 return labelBin;
807         }
808         catch(exception& e) {
809                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
810                 exit(1);
811         }
812 }
813 //**********************************************************************************************************************
814 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
815         try {
816                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
817                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
818                 
819         map<string, string> variables; 
820         variables["[filename]"] = fileroot;
821         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
822         variables["[clustertag]"] = tag;
823         string sabundFileName = getOutputFileName("sabund", variables);
824         string rabundFileName = getOutputFileName("rabund", variables);
825         string listFileName = getOutputFileName("list", variables);
826         
827         if (countfile == "") {
828             m->openOutputFile(sabundFileName,   outSabund);
829             m->openOutputFile(rabundFileName,   outRabund);
830             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
831             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
832             
833         }
834                 m->openOutputFile(listFileName, outList);
835         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
836                 
837                 
838                 map<float, int>::iterator itLabel;
839
840                 //for each label needed
841                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
842                         
843                         string thisLabel;
844                         if (itLabel->first == -1) { thisLabel = "unique"; }
845                         else { thisLabel = toString(itLabel->first,  length-1);  } 
846                         
847                         outList << thisLabel << '\t' << itLabel->second << '\t';
848             
849             RAbundVector* rabund = NULL;
850             if (countfile == "") {
851                 rabund = new RAbundVector();
852                 rabund->setLabel(thisLabel);
853             }
854
855                         //add in singletons
856                         if (listSingle != NULL) {
857                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
858                                         outList << listSingle->get(j) << '\t';
859                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
860                                 }
861                         }
862                         
863                         //get the list info from each file
864                         for (int k = 0; k < listNames.size(); k++) {
865         
866                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
867                                 
868                                 InputData* input = new InputData(listNames[k], "list");
869                                 ListVector* list = input->getListVector(thisLabel);
870                                 
871                                 //this file has reached the end
872                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
873                                 else {          
874                                         for (int j = 0; j < list->getNumBins(); j++) {
875                                                 outList << list->get(j) << '\t';
876                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
877                                         }
878                                         delete list;
879                                 }
880                                 delete input;
881                         }
882                         
883             if (countfile == "") {
884                 SAbundVector sabund = rabund->getSAbundVector();
885                 sabund.print(outSabund);
886                 rabund->print(outRabund);
887             }
888                         outList << endl;
889                         
890                         if (rabund != NULL) { delete rabund; }
891                 }
892                 
893                 outList.close();
894         if (countfile == "") {
895             outRabund.close();
896             outSabund.close();
897                 }
898                 if (listSingle != NULL) { delete listSingle;  }
899                 
900                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
901                 
902                 return 0;
903         }
904         catch(exception& e) {
905                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
906                 exit(1);
907         }
908 }
909
910 //**********************************************************************************************************************
911
912 void ClusterSplitCommand::printData(ListVector* oldList){
913         try {
914                 string label = oldList->getLabel();
915                 RAbundVector oldRAbund = oldList->getRAbundVector();
916                 
917                 oldRAbund.setLabel(label);
918                 if (m->isTrue(showabund)) {
919                         oldRAbund.getSAbundVector().print(cout);
920                 }
921                 oldRAbund.print(outRabund);
922                 oldRAbund.getSAbundVector().print(outSabund);
923         
924                 oldList->print(outList);
925         }
926         catch(exception& e) {
927                 m->errorOut(e, "ClusterSplitCommand", "printData");
928                 exit(1);
929         }
930 }
931 //**********************************************************************************************************************
932 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
933         try {
934         
935         vector<string> listFiles;
936         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
937         dividedNames.resize(processors);
938         
939         //for each file group figure out which process will complete it
940         //want to divide the load intelligently so the big files are spread between processes
941         for (int i = 0; i < distName.size(); i++) { 
942             //cout << i << endl;
943             int processToAssign = (i+1) % processors; 
944             if (processToAssign == 0) { processToAssign = processors; }
945             
946             dividedNames[(processToAssign-1)].push_back(distName[i]);
947             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
948         }
949         
950         //not lets reverse the order of ever other process, so we balance big files running with little ones
951         for (int i = 0; i < processors; i++) {
952             //cout << i << endl;
953             int remainder = ((i+1) % processors);
954             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
955         }
956         
957         if (m->control_pressed) { return listFiles; }
958         
959         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
960                 int process = 1;
961                 processIDS.clear();
962                 
963                 //loop through and create all the processes you want
964                 while (process != processors) {
965                         int pid = fork();
966                         
967                         if (pid > 0) {
968                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
969                                 process++;
970                         }else if (pid == 0){
971                                 set<string> labels;
972                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
973                                 
974                                 //write out names to file
975                                 string filename = toString(getpid()) + ".temp";
976                                 ofstream out;
977                                 m->openOutputFile(filename, out);
978                                 out << tag << endl;
979                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
980                                 out.close();
981                                 
982                                 //print out labels
983                                 ofstream outLabels;
984                                 filename = toString(getpid()) + ".temp.labels";
985                                 m->openOutputFile(filename, outLabels);
986                                 
987                                 outLabels << cutoff << endl;
988                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
989                                         outLabels << (*it) << endl;
990                                 }
991                                 outLabels.close();
992
993                                 exit(0);
994                         }else { 
995                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
996                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
997                                 exit(0);
998                         }
999                 }
1000                 
1001         //do your part
1002         listFiles = cluster(dividedNames[0], labels);
1003         
1004                 //force parent to wait until all the processes are done
1005                 for (int i=0;i< processIDS.size();i++) { 
1006                         int temp = processIDS[i];
1007                         wait(&temp);
1008                 }
1009         
1010         //get list of list file names from each process
1011         for(int i=0;i<processIDS.size();i++){
1012             string filename = toString(processIDS[i]) + ".temp";
1013             ifstream in;
1014             m->openInputFile(filename, in);
1015             
1016             in >> tag; m->gobble(in);
1017             
1018             while(!in.eof()) {
1019                 string tempName;
1020                 in >> tempName; m->gobble(in);
1021                 listFiles.push_back(tempName);
1022             }
1023             in.close();
1024             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1025             
1026             //get labels
1027             filename = toString(processIDS[i]) + ".temp.labels";
1028             ifstream in2;
1029             m->openInputFile(filename, in2);
1030             
1031             float tempCutoff;
1032             in2 >> tempCutoff; m->gobble(in2);
1033             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1034             
1035             while(!in2.eof()) {
1036                 string tempName;
1037                 in2 >> tempName; m->gobble(in2);
1038                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1039             }
1040             in2.close();
1041             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1042         }
1043         
1044
1045     #else
1046        
1047         //////////////////////////////////////////////////////////////////////////////////////////////////////
1048                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
1049                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1050                 //Taking advantage of shared memory to allow both threads to add labels.
1051                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1052                 /*
1053                 vector<clusterData*> pDataArray; 
1054                 DWORD   dwThreadIdArray[processors-1];
1055                 HANDLE  hThreadArray[processors-1]; 
1056                 
1057                 //Create processor worker threads.
1058                 for( int i=1; i<processors; i++ ){
1059                         // Allocate memory for thread data.
1060                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1061                         pDataArray.push_back(tempCluster);
1062                         processIDS.push_back(i);
1063             
1064                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1065                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1066                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
1067             
1068                 }
1069         
1070         //do your part
1071         listFiles = cluster(dividedNames[0], labels);
1072         
1073                 //Wait until all threads have terminated.
1074                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1075                 
1076                 //Close all thread handles and free memory allocations.
1077                 for(int i=0; i < pDataArray.size(); i++){
1078             //get tag
1079             tag = pDataArray[i]->tag;
1080             //get listfiles created
1081             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1082             //get labels
1083             set<string>::iterator it;
1084             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1085                         //check cutoff
1086             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1087                         CloseHandle(hThreadArray[i]);
1088                         delete pDataArray[i];
1089                 }
1090 */
1091         #endif          
1092         
1093         return listFiles;
1094         
1095         }
1096         catch(exception& e) {
1097                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1098                 exit(1);
1099         }
1100 }
1101 //**********************************************************************************************************************
1102
1103 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1104         try {
1105                 
1106                 vector<string> listFileNames;
1107                 double smallestCutoff = cutoff;
1108                 
1109                 //cluster each distance file
1110                 for (int i = 0; i < distNames.size(); i++) {
1111             
1112                         string thisNamefile = distNames[i].begin()->second;
1113                         string thisDistFile = distNames[i].begin()->first;
1114                         
1115                         string listFileName = "";
1116             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1117             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1118
1119                         if (m->control_pressed) { //clean up
1120                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1121                                 listFileNames.clear(); return listFileNames;
1122                         }
1123             
1124             listFileNames.push_back(listFileName);
1125         }
1126                 
1127                 cutoff = smallestCutoff;
1128                                         
1129                 return listFileNames;
1130         
1131         }
1132         catch(exception& e) {
1133                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1134                 exit(1);
1135         }
1136
1137
1138 }
1139 //**********************************************************************************************************************
1140 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1141         try {
1142         string listFileName = "";
1143         
1144         ListVector* list = NULL;
1145         ListVector oldList;
1146         RAbundVector* rabund = NULL;
1147         
1148 #ifdef USE_MPI
1149         int pid;
1150         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1151         
1152         //output your files too
1153         if (pid != 0) {
1154             cout << endl << "Reading " << thisDistFile << endl;
1155         }
1156 #endif
1157
1158         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1159         
1160         //reads phylip file storing data in 2D vector, also fills list and rabund
1161         bool sim = false;
1162                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1163         
1164         NameAssignment* nameMap = NULL;
1165         CountTable* ct = NULL;
1166         if(namefile != ""){     
1167                         nameMap = new NameAssignment(thisNamefile);
1168                         nameMap->readMap();
1169             cluster->readPhylipFile(thisDistFile, nameMap);
1170                 }else if (countfile != "") {
1171             ct = new CountTable();
1172             ct->readTable(thisNamefile);
1173             cluster->readPhylipFile(thisDistFile, ct);
1174         }
1175         tag = cluster->getTag();
1176         
1177                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1178             else { delete ct; } delete cluster; return 0; }
1179                 
1180                 list = cluster->getListVector();
1181                 rabund = cluster->getRAbundVector();
1182         
1183                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1184                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1185         listFileName = fileroot+ tag + ".list";
1186         
1187         ofstream listFile;
1188                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1189                 
1190                 float previousDist = 0.00000;
1191                 float rndPreviousDist = 0.00000;
1192                 oldList = *list;
1193                 
1194 #ifdef USE_MPI
1195         //output your files too
1196         if (pid != 0) {
1197             cout << endl << "Clustering " << thisDistFile << endl;
1198         }
1199 #endif
1200
1201         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1202         
1203                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1204                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1205                 else { delete ct; } return listFileName;  }
1206             
1207                         cluster->update(cutoff);
1208             
1209                         float dist = cluster->getSmallDist();
1210                         float rndDist;
1211                         if (hard) {
1212                                 rndDist = m->ceilDist(dist, precision); 
1213                         }else{
1214                                 rndDist = m->roundDist(dist, precision); 
1215                         }
1216             
1217             if(previousDist <= 0.0000 && dist != previousDist){
1218                 oldList.setLabel("unique");
1219                 oldList.print(listFile);
1220                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1221             }
1222             else if(rndDist != rndPreviousDist){
1223                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1224                 oldList.print(listFile);
1225                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1226             }
1227
1228             
1229                         previousDist = dist;
1230                         rndPreviousDist = rndDist;
1231                         oldList = *list;
1232                 }
1233         
1234                 if(previousDist <= 0.0000){
1235             oldList.setLabel("unique");
1236             oldList.print(listFile);
1237             if (labels.count("unique") == 0) { labels.insert("unique"); }
1238         }
1239         else if(rndPreviousDist<cutoff){
1240             oldList.setLabel(toString(rndPreviousDist,  length-1));
1241             oldList.print(listFile);
1242             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1243         }
1244
1245         
1246                 listFile.close();
1247                 
1248                 delete cluster;  delete list; delete rabund;
1249         if(namefile != ""){     delete nameMap; }
1250         else { delete ct; }
1251         
1252         m->mothurRemove(thisDistFile);
1253         m->mothurRemove(thisNamefile);
1254         
1255         return listFileName;
1256         
1257         }
1258         catch(exception& e) {
1259                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1260                 exit(1);
1261         }
1262 }
1263
1264 //**********************************************************************************************************************
1265 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1266         try {
1267         string listFileName = "";
1268         
1269         Cluster* cluster = NULL;
1270         SparseDistanceMatrix* matrix = NULL;
1271         ListVector* list = NULL;
1272         ListVector oldList;
1273         RAbundVector* rabund = NULL;
1274         
1275         if (m->control_pressed) { return listFileName; }
1276         
1277 #ifdef USE_MPI
1278         int pid;
1279         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1280         
1281         //output your files too
1282         if (pid != 0) {
1283             cout << endl << "Reading " << thisDistFile << endl;
1284         }
1285 #endif
1286         
1287         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1288         
1289         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1290         read->setCutoff(cutoff);
1291         
1292         NameAssignment* nameMap = NULL;
1293         CountTable* ct = NULL;
1294                 if(namefile != ""){     
1295                         nameMap = new NameAssignment(thisNamefile);
1296                         nameMap->readMap();
1297             read->read(nameMap);
1298                 }else if (countfile != "") {
1299             ct = new CountTable();
1300             ct->readTable(thisNamefile);
1301             read->read(ct);
1302         }else { read->read(nameMap); }
1303                 
1304                 list = read->getListVector();
1305         oldList = *list;
1306                 matrix = read->getDMatrix();
1307         
1308                 if(countfile != "") {
1309             rabund = new RAbundVector();
1310             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1311             delete ct;
1312         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1313
1314         delete read;  read = NULL;
1315         if (namefile != "") { delete nameMap; nameMap = NULL; }
1316         
1317         
1318 #ifdef USE_MPI
1319         //output your files too
1320         if (pid != 0) {
1321             cout << endl << "Clustering " << thisDistFile << endl;
1322         }
1323 #endif
1324         
1325         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1326                 
1327         //create cluster
1328         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1329         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1330         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1331         tag = cluster->getTag();
1332                 
1333         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1334         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1335         
1336         ofstream listFile;
1337         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1338                 
1339         listFileName = fileroot+ tag + ".list";
1340         
1341         float previousDist = 0.00000;
1342         float rndPreviousDist = 0.00000;
1343         
1344         oldList = *list;
1345         
1346         print_start = true;
1347         start = time(NULL);
1348         double saveCutoff = cutoff;
1349                 
1350         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1351             
1352             if (m->control_pressed) { //clean up
1353                 delete matrix; delete list;     delete cluster; delete rabund;
1354                 listFile.close();
1355                 m->mothurRemove(listFileName);  
1356                 return listFileName;
1357             }
1358             
1359             cluster->update(saveCutoff);
1360             
1361             float dist = matrix->getSmallDist();
1362             float rndDist;
1363             if (hard) {
1364                 rndDist = m->ceilDist(dist, precision); 
1365             }else{
1366                 rndDist = m->roundDist(dist, precision); 
1367             }
1368             
1369             if(previousDist <= 0.0000 && dist != previousDist){
1370                 oldList.setLabel("unique");
1371                 oldList.print(listFile);
1372                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1373             }
1374             else if(rndDist != rndPreviousDist){
1375                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1376                 oldList.print(listFile);
1377                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1378             }
1379             
1380             previousDist = dist;
1381             rndPreviousDist = rndDist;
1382             oldList = *list;
1383         }
1384         
1385                 
1386         if(previousDist <= 0.0000){
1387             oldList.setLabel("unique");
1388             oldList.print(listFile);
1389             if (labels.count("unique") == 0) { labels.insert("unique"); }
1390         }
1391         else if(rndPreviousDist<cutoff){
1392             oldList.setLabel(toString(rndPreviousDist,  length-1));
1393             oldList.print(listFile);
1394             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1395         }
1396         
1397         delete matrix; delete list;     delete cluster; delete rabund; 
1398         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1399         listFile.close();
1400         
1401         if (m->control_pressed) { //clean up
1402             m->mothurRemove(listFileName);      
1403             return listFileName;
1404         }
1405         
1406         m->mothurRemove(thisDistFile);
1407         m->mothurRemove(thisNamefile);
1408         
1409         if (saveCutoff != cutoff) { 
1410             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1411             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1412                         
1413             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1414         }
1415         
1416         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1417         
1418         return listFileName;
1419         
1420         }
1421         catch(exception& e) {
1422                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1423                 exit(1);
1424         }
1425 }
1426 //**********************************************************************************************************************
1427
1428 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1429         try{
1430                 
1431 #ifdef USE_MPI
1432                 int pid;
1433                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1434                 
1435                 if (pid != 0) {
1436 #endif
1437                 
1438                 string thisOutputDir = outputDir;
1439                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1440         map<string, string> variables; 
1441         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1442                 string outputFileName = getOutputFileName("column", variables);
1443                 m->mothurRemove(outputFileName);
1444                 
1445                 
1446                 for (int i = 0; i < distNames.size(); i++) {
1447                         if (m->control_pressed) {  return 0; }
1448                         
1449                         string thisDistFile = distNames[i].begin()->first;
1450                         
1451                         m->appendFiles(thisDistFile, outputFileName);
1452                 }       
1453                         
1454                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1455                         
1456 #ifdef USE_MPI
1457                 }
1458 #endif
1459                                 
1460                 return 0;       
1461                 
1462                 
1463         }
1464         catch(exception& e) {
1465                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1466                 exit(1);
1467         }
1468 }
1469 //**********************************************************************************************************************
1470 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1471     try {
1472         rabund->setLabel(list->getLabel());        
1473         for(int i = 0; i < list->getNumBins(); i++) { 
1474             if (m->control_pressed) { break; }
1475             vector<string> binNames;
1476             string bin = list->get(i);
1477             m->splitAtComma(bin, binNames);
1478             int total = 0;
1479             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1480             rabund->push_back(total);   
1481         }
1482         return 0;
1483     }
1484     catch(exception& e) {
1485                 m->errorOut(e, "ClusterCommand", "createRabund");
1486                 exit(1);
1487         }
1488     
1489 }
1490 //**********************************************************************************************************************