]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
adding labels to list file.
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
36                         
37                 vector<string> myArray;
38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
39                 return myArray;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
43                 exit(1);
44         }
45 }
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){    
48         try {
49                 string helpString = "";
50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59                 helpString += "The name parameter allows you to enter your name file. \n";
60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64                 helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
70 #ifdef USE_MPI
71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
72 #endif
73                 helpString += "The cluster.split command should be in the following format: \n";
74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
76                 return helpString;
77         }
78         catch(exception& e) {
79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
80                 exit(1);
81         }
82 }
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
85     try {
86         string pattern = "";
87         
88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
91         else if (type == "column") {  pattern = "[filename],dist"; }
92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
93         
94         return pattern;
95     }
96     catch(exception& e) {
97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
98         exit(1);
99     }
100 }
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){     
103         try {
104                 abort = true; calledHelp = true; 
105                 setParameters();
106                 vector<string> tempOutNames;
107                 outputTypes["list"] = tempOutNames;
108                 outputTypes["rabund"] = tempOutNames;
109                 outputTypes["sabund"] = tempOutNames;
110                 outputTypes["column"] = tempOutNames;
111         }
112         catch(exception& e) {
113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
114                 exit(1);
115         }
116 }
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
120         try{
121                 abort = false; calledHelp = false;   
122                 format = "";
123                 
124                 //allow user to run help
125                 if(option == "help") { help(); abort = true; calledHelp = true; }
126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
127                 
128                 else {
129                         vector<string> myArray = setParameters();
130                         
131                         OptionParser parser(option);
132                         map<string,string> parameters = parser.getParameters();
133                         
134                         ValidParameters validParameter("cluster.split");
135                 
136                         //check to make sure all parameters are valid for command
137                         map<string,string>::iterator it;
138                         for (it = parameters.begin(); it != parameters.end(); it++) { 
139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
140                                         abort = true;
141                                 }
142                         }
143                         
144                         //initialize outputTypes
145                         vector<string> tempOutNames;
146                         outputTypes["list"] = tempOutNames;
147                         outputTypes["rabund"] = tempOutNames;
148                         outputTypes["sabund"] = tempOutNames;
149                         outputTypes["column"] = tempOutNames;
150                         
151                         //if the user changes the output directory command factory will send this info to us in the output parameter 
152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
153                         
154                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
156                         if (inputDir == "not found"){   inputDir = "";          }
157                         else {
158                                 string path;
159                                 it = parameters.find("phylip");
160                                 //user has given a template file
161                                 if(it != parameters.end()){ 
162                                         path = m->hasPath(it->second);
163                                         //if the user has not given a path then, add inputdir. else leave path alone.
164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
165                                 }
166                                 
167                                 it = parameters.find("column");
168                                 //user has given a template file
169                                 if(it != parameters.end()){ 
170                                         path = m->hasPath(it->second);
171                                         //if the user has not given a path then, add inputdir. else leave path alone.
172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
173                                 }
174                                 
175                                 it = parameters.find("name");
176                                 //user has given a template file
177                                 if(it != parameters.end()){ 
178                                         path = m->hasPath(it->second);
179                                         //if the user has not given a path then, add inputdir. else leave path alone.
180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
181                                 }
182                                 
183                                 it = parameters.find("taxonomy");
184                                 //user has given a template file
185                                 if(it != parameters.end()){ 
186                                         path = m->hasPath(it->second);
187                                         //if the user has not given a path then, add inputdir. else leave path alone.
188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
189                                 }
190                                 
191                                 it = parameters.find("fasta");
192                                 //user has given a template file
193                                 if(it != parameters.end()){ 
194                                         path = m->hasPath(it->second);
195                                         //if the user has not given a path then, add inputdir. else leave path alone.
196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
197                                 }
198                 
199                 it = parameters.find("count");
200                                 //user has given a template file
201                                 if(it != parameters.end()){ 
202                                         path = m->hasPath(it->second);
203                                         //if the user has not given a path then, add inputdir. else leave path alone.
204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
205                                 }
206                         }
207                         
208                         //check for required parameters
209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
210                         if (phylipfile == "not open") { abort = true; }
211                         else if (phylipfile == "not found") { phylipfile = ""; }        
212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
213                         
214                         columnfile = validParameter.validFile(parameters, "column", true);
215                         if (columnfile == "not open") { abort = true; } 
216                         else if (columnfile == "not found") { columnfile = ""; }
217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
218                         
219                         namefile = validParameter.validFile(parameters, "name", true);
220                         if (namefile == "not open") { abort = true; namefile = "";}     
221                         else if (namefile == "not found") { namefile = "";  }
222                         else { m->setNameFile(namefile); }
223             
224             countfile = validParameter.validFile(parameters, "count", true);
225                         if (countfile == "not open") { abort = true; countfile = "";}   
226                         else if (countfile == "not found") { countfile = "";  }
227                         else { m->setCountTableFile(countfile); }
228                         
229                         fastafile = validParameter.validFile(parameters, "fasta", true);
230                         if (fastafile == "not open") { abort = true; }  
231                         else if (fastafile == "not found") { fastafile = ""; }
232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
233                         
234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
235                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
236                         else if (taxFile == "not found") { taxFile = ""; }
237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
238                         
239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
240                                 //is there are current file available for either of these?
241                                 //give priority to column, then phylip, then fasta
242                                 columnfile = m->getColumnFile(); 
243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
244                                 else { 
245                                         phylipfile = m->getPhylipFile(); 
246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
247                                         else { 
248                                                 fastafile = m->getFastaFile(); 
249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
250                                                 else { 
251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
252                                                         abort = true; 
253                                                 }
254                                         }
255                                 }
256                         }
257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
258             
259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
260             
261                         if (columnfile != "") {
262                                 if ((namefile == "") && (countfile == "")) { 
263                                         namefile = m->getNameFile(); 
264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
265                                         else { 
266                                                 countfile = m->getCountTableFile();
267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
268                         else { 
269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
270                             abort = true; 
271                         }       
272                                         }       
273                                 }
274                         }
275                         
276                         if (fastafile != "") {
277                                 if (taxFile == "") { 
278                                         taxFile = m->getTaxonomyFile(); 
279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
280                                         else { 
281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
282                                                 abort = true; 
283                                         }       
284                                 }
285                                 
286                                 if ((namefile == "") && (countfile == "")) { 
287                                         namefile = m->getNameFile(); 
288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
289                                         else { 
290                                                 countfile = m->getCountTableFile();
291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
292                         else { 
293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); 
294                             abort = true; 
295                         }       
296                                         }       
297                                 }
298                         }
299                                         
300                         //check for optional parameter and set defaults
301                         // ...at some point should added some additional type checking...
302                         //get user cutoff and precision or use defaults
303                         string temp;
304                         temp = validParameter.validFile(parameters, "precision", false);
305                         if (temp == "not found") { temp = "100"; }
306                         //saves precision legnth for formatting below
307                         length = temp.length();
308                         m->mothurConvert(temp, precision); 
309                         
310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
311                         hard = m->isTrue(temp);
312                         
313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
314                         large = m->isTrue(temp);
315             
316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
317                         m->setProcessors(temp);
318                         m->mothurConvert(temp, processors);
319                         
320                         temp = validParameter.validFile(parameters, "splitmethod", false);      
321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322                                 if (temp == "not found")  { splitmethod = "distance"; }
323                                 else {  splitmethod = temp; }
324                         }
325                         
326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
327                         classic = m->isTrue(temp);
328             
329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
330
331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
332                         m->mothurConvert(temp, cutoff); 
333                         cutoff += (5 / (precision * 10.0));  
334                         
335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
336                         m->mothurConvert(temp, taxLevelCutoff); 
337                         
338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
339                         
340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
342                         
343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
345                         
346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
347
348                         showabund = validParameter.validFile(parameters, "showabund", false);
349                         if (showabund == "not found") { showabund = "T"; }
350             
351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
352             runCluster = m->isTrue(temp);
353
354                         timing = validParameter.validFile(parameters, "timing", false);
355                         if (timing == "not found") { timing = "F"; }
356                         
357                 }
358         }
359         catch(exception& e) {
360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
361                 exit(1);
362         }
363 }
364
365 //**********************************************************************************************************************
366
367 int ClusterSplitCommand::execute(){
368         try {
369         
370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
371                 
372                 time_t estart;
373                 vector<string> listFileNames;
374                 set<string> labels;
375                 string singletonName = "";
376                 double saveCutoff = cutoff;
377
378                 //****************** file prep work ******************************//
379                 #ifdef USE_MPI
380                         int pid;
381                         int tag = 2001;
382                         MPI_Status status; 
383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
385                         
386                         if (pid == 0) { //only process 0 converts and splits
387                         
388                 #endif
389                 
390                 //if user gave a phylip file convert to column file
391                 if (format == "phylip") {
392                         estart = time(NULL);
393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
394                         
395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
396                         
397                         NameAssignment* nameMap = NULL;
398                         convert->setFormat("phylip");
399                         convert->read(nameMap);
400                         
401                         if (m->control_pressed) {  delete convert;  return 0;  }
402                         
403                         distfile = convert->getOutputFile();
404                 
405                         //if no names file given with phylip file, create it
406                         ListVector* listToMakeNameFile =  convert->getListVector();
407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
408                                 ofstream out;
409                                 namefile = phylipfile + ".names";
410                                 m->openOutputFile(namefile, out);
411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412                                         string bin = listToMakeNameFile->get(i);
413                                         out << bin << '\t' << bin << endl;
414                                 }
415                                 out.close();
416                         }
417                         delete listToMakeNameFile;
418                         delete convert;
419                         
420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
421                 }
422                 if (m->control_pressed) { return 0; }
423                 
424                 estart = time(NULL);
425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
426                 
427                 //split matrix into non-overlapping groups
428                 SplitMatrix* split;
429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
433                 
434                 split->split();
435                 
436                 if (m->control_pressed) { delete split; return 0; }
437                 
438                 singletonName = split->getSingletonNames();
439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
440                 delete split;
441                 
442         if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
443                 
444                 //output a merged distance file
445                 //if (splitmethod == "fasta")           { createMergedDistanceFile(distName); }
446                                 
447                 if (m->control_pressed) { return 0; }
448                 
449                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
450                 estart = time(NULL);
451               
452         if (!runCluster) { 
453
454                 m->mothurOutEndLine();
455                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
456                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
457                 m->mothurOutEndLine();
458                 return 0;
459                 
460         }
461    
462                 //****************** break up files between processes and cluster each file set ******************************//
463         #ifdef USE_MPI
464                         ////you are process 0 from above////
465                         
466                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
467                         dividedNames.resize(processors);
468                                         
469                         //for each file group figure out which process will complete it
470                         //want to divide the load intelligently so the big files are spread between processes
471                         for (int i = 0; i < distName.size(); i++) { 
472                                 int processToAssign = (i+1) % processors; 
473                                 if (processToAssign == 0) { processToAssign = processors; }
474                                                 
475                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
476                         }
477                                         
478                         //not lets reverse the order of ever other process, so we balance big files running with little ones
479                         for (int i = 0; i < processors; i++) {
480                                 int remainder = ((i+1) % processors);
481                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
482                         }
483                         
484                         
485                         //send each child the list of files it needs to process
486                         for(int i = 1; i < processors; i++) { 
487                                 //send number of file pairs
488                                 int num = dividedNames[i].size();
489                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
490                                 
491                                 for (int j = 0; j < num; j++) { //send filenames to process i
492                                         char tempDistFileName[1024];
493                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
494                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
495                                         
496                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
497                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
498                                         
499                                         char tempNameFileName[1024];
500                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
501                                         int lengthName = (dividedNames[i][j].begin()->second).length();
502
503                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
504                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
505                                 }
506                         }
507                         
508                         //process your share
509                         listFileNames = cluster(dividedNames[0], labels);
510                         
511                         //receive the other processes info
512                         for(int i = 1; i < processors; i++) { 
513                                 int num = dividedNames[i].size();
514                                 
515                                 double tempCutoff;
516                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
517                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
518                                 
519                                 //send list filenames to root process
520                                 for (int j = 0; j < num; j++) {  
521                                         int lengthList = 0;
522                                         char tempListFileName[1024];
523                                 
524                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
525                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
526                                 
527                                         string myListFileName = tempListFileName;
528                                         myListFileName = myListFileName.substr(0, lengthList);
529                                         
530                                         listFileNames.push_back(myListFileName);
531                                 }
532                                 
533                                 //send Labels to root process
534                                 int numLabels = 0;
535                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
536                                 
537                                 for (int j = 0; j < numLabels; j++) {  
538                                         int lengthLabel = 0;
539                                         char tempLabel[100];
540                                 
541                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
542                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
543                                 
544                                         string myLabel = tempLabel;
545                                         myLabel = myLabel.substr(0, lengthLabel);
546                         
547                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
548                                 }
549                         }
550                         
551                 }else { //you are a child process
552                         vector < map<string, string> >  myNames;
553                         
554                         //recieve the files you need to process
555                         //receive number of file pairs
556                         int num = 0;
557                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
558                         
559                         myNames.resize(num);
560         
561                         for (int j = 0; j < num; j++) { //receive filenames to process 
562                                 int lengthDist = 0;
563                                 char tempDistFileName[1024];
564                                 
565                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
566                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
567                                 
568                                 string myDistFileName = tempDistFileName;
569                                 myDistFileName = myDistFileName.substr(0, lengthDist);
570                         
571                                 int lengthName = 0;
572                                 char tempNameFileName[1024];
573                                 
574                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
575                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
576                                 
577                                 string myNameFileName = tempNameFileName;
578                                 myNameFileName = myNameFileName.substr(0, lengthName);
579                                 
580                                 //save file name
581                                 myNames[j][myDistFileName] = myNameFileName;
582                         }
583         
584                         //process them
585                         listFileNames = cluster(myNames, labels);
586                         
587                         //send cutoff
588                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
589                         
590                         //send list filenames to root process
591                         for (int j = 0; j < num; j++) {  
592                                 char tempListFileName[1024];
593                                 strcpy(tempListFileName, listFileNames[j].c_str());
594                                 int lengthList = listFileNames[j].length();
595                                         
596                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
597                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
598                         }
599                         
600                         //send Labels to root process
601                         int numLabels = labels.size();
602                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
603                         
604                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
605                                 char tempLabel[100];
606                                 strcpy(tempLabel, (*it).c_str());
607                                 int lengthLabel = (*it).length();
608                                         
609                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
610                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
611                         }
612                 }
613                 
614                 //make everyone wait
615                 MPI_Barrier(MPI_COMM_WORLD);
616                 
617         #else
618                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
619                 //sanity check
620                 if (processors > distName.size()) { processors = distName.size(); }
621                 
622                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
623                                 if(processors == 1){
624                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
625                                 }else{
626                                         listFileNames = createProcesses(distName, labels);
627                 }
628                 #else
629                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
630                 #endif
631         #endif  
632                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
633                 
634                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
635                 
636                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
637                 
638                 //****************** merge list file and create rabund and sabund files ******************************//
639                 estart = time(NULL);
640                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
641                 
642                 #ifdef USE_MPI
643                         if (pid == 0) { //only process 0 merges
644                 #endif
645
646                 ListVector* listSingle;
647                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
648                 
649                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
650                 
651                 mergeLists(listFileNames, labelBins, listSingle);
652
653                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
654                 
655                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
656                 
657                 //set list file as new current listfile
658                 string current = "";
659                 itTypes = outputTypes.find("list");
660                 if (itTypes != outputTypes.end()) {
661                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
662                 }
663                 
664                 //set rabund file as new current rabundfile
665                 itTypes = outputTypes.find("rabund");
666                 if (itTypes != outputTypes.end()) {
667                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
668                 }
669                 
670                 //set sabund file as new current sabundfile
671                 itTypes = outputTypes.find("sabund");
672                 if (itTypes != outputTypes.end()) {
673                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
674                 }
675                                 
676                 m->mothurOutEndLine();
677                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
678                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
679                 m->mothurOutEndLine();
680                 
681                 #ifdef USE_MPI
682                         } //only process 0 merges
683                         
684                         //make everyone wait
685                         MPI_Barrier(MPI_COMM_WORLD);
686                 #endif
687
688                 return 0;
689         }
690         catch(exception& e) {
691                 m->errorOut(e, "ClusterSplitCommand", "execute");
692                 exit(1);
693         }
694 }
695 //**********************************************************************************************************************
696 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
697         try {
698                                 
699                 map<float, int> labelBin;
700                 vector<float> orderFloat;
701                 int numSingleBins;
702                 
703                 //read in singletons
704                 if (singleton != "none") {
705             
706             ifstream in;
707             m->openInputFile(singleton, in);
708                                 
709                         string firstCol, secondCol;
710                         listSingle = new ListVector();
711             
712             if (countfile != "") { m->getline(in); m->gobble(in); }
713             
714                         while (!in.eof()) {
715                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
716                                 if (countfile == "") { listSingle->push_back(secondCol); }
717                 else { listSingle->push_back(firstCol); }
718                         }
719             
720                         in.close();
721                         m->mothurRemove(singleton);
722                         
723                         numSingleBins = listSingle->getNumBins();
724                 }else{  listSingle = NULL; numSingleBins = 0;  }
725                 
726                 //go through users set and make them floats so we can sort them 
727                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
728                         float temp = -10.0;
729
730                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
731                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
732                         
733                         if (temp <= cutoff) {
734                                 orderFloat.push_back(temp);
735                                 labelBin[temp] = numSingleBins; //initialize numbins 
736                         }
737                 }
738         
739                 //sort order
740                 sort(orderFloat.begin(), orderFloat.end());
741                 userLabels.clear();
742                         
743                 //get the list info from each file
744                 for (int k = 0; k < listNames.size(); k++) {
745         
746                         if (m->control_pressed) {  
747                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
748                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
749                                 return labelBin;
750                         }
751                         
752                         InputData* input = new InputData(listNames[k], "list");
753                         ListVector* list = input->getListVector();
754                         string lastLabel = list->getLabel();
755                         
756                         string filledInList = listNames[k] + "filledInTemp";
757                         ofstream outFilled;
758                         m->openOutputFile(filledInList, outFilled);
759         
760                         //for each label needed
761                         for(int l = 0; l < orderFloat.size(); l++){
762                         
763                                 string thisLabel;
764                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
765                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
766
767                                 //this file has reached the end
768                                 if (list == NULL) { 
769                                         list = input->getListVector(lastLabel, true); 
770                                 }else{  //do you have the distance, or do you need to fill in
771                                                 
772                                         float labelFloat;
773                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
774                                         else { convert(list->getLabel(), labelFloat); }
775
776                                         //check for missing labels
777                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
778                                                 //if its bigger get last label, otherwise keep it
779                                                 delete list;
780                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
781                                         }
782                                         lastLabel = list->getLabel();
783                                 }
784                                 
785                                 //print to new file
786                                 list->setLabel(thisLabel);
787                                 list->print(outFilled);
788                 
789                                 //update labelBin
790                                 labelBin[orderFloat[l]] += list->getNumBins();
791                                                                         
792                                 delete list;
793                                                                         
794                                 list = input->getListVector();
795                         }
796                         
797                         if (list != NULL) { delete list; }
798                         delete input;
799                         
800                         outFilled.close();
801                         m->mothurRemove(listNames[k]);
802                         rename(filledInList.c_str(), listNames[k].c_str());
803                 }
804                 
805                 return labelBin;
806         }
807         catch(exception& e) {
808                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
809                 exit(1);
810         }
811 }
812 //**********************************************************************************************************************
813 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
814         try {
815                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
816                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
817                 
818         map<string, string> variables; 
819         variables["[filename]"] = fileroot;
820         variables["[clustertag]"] = tag;
821         string sabundFileName = getOutputFileName("sabund", variables);
822         string rabundFileName = getOutputFileName("rabund", variables);
823         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
824         string listFileName = getOutputFileName("list", variables);
825         
826         if (countfile == "") {
827             m->openOutputFile(sabundFileName,   outSabund);
828             m->openOutputFile(rabundFileName,   outRabund);
829             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
830             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
831             
832         }
833                 m->openOutputFile(listFileName, outList);
834         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
835                 
836                 map<float, int>::iterator itLabel;
837         
838         //clears out junk for autocompleting of list files above.  Perhaps there is a beter way to handle this from within the data structure?
839         m->printedListHeaders = false;
840
841                 //for each label needed
842                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
843                         
844                         string thisLabel;
845                         if (itLabel->first == -1) { thisLabel = "unique"; }
846                         else { thisLabel = toString(itLabel->first,  length-1);  } 
847                         
848                         //outList << thisLabel << '\t' << itLabel->second << '\t';
849             
850             RAbundVector* rabund = NULL;
851             ListVector completeList;
852             completeList.setLabel(thisLabel);
853             
854             if (countfile == "") {
855                 rabund = new RAbundVector();
856                 rabund->setLabel(thisLabel);
857             }
858
859                         //add in singletons
860                         if (listSingle != NULL) {
861                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
862                                         //outList << listSingle->get(j) << '\t';
863                     completeList.push_back(listSingle->get(j));
864                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
865                                 }
866                         }
867                         
868                         //get the list info from each file
869                         for (int k = 0; k < listNames.size(); k++) {
870         
871                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
872                                 
873                                 InputData* input = new InputData(listNames[k], "list");
874                                 ListVector* list = input->getListVector(thisLabel);
875                                 
876                                 //this file has reached the end
877                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
878                                 else {          
879                                         for (int j = 0; j < list->getNumBins(); j++) {
880                                                 //outList << list->get(j) << '\t';
881                         completeList.push_back(list->get(j));
882                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
883                                         }
884                                         delete list;
885                                 }
886                                 delete input;
887                         }
888                         
889             if (countfile == "") {
890                 SAbundVector sabund = rabund->getSAbundVector();
891                 sabund.print(outSabund);
892                 rabund->print(outRabund);
893             }
894                         //outList << endl;
895             if (!m->printedListHeaders) { 
896                 m->listBinLabelsInFile.clear(); completeList.printHeaders(outList); }
897             completeList.print(outList);
898                         
899                         if (rabund != NULL) { delete rabund; }
900                 }
901                 
902                 outList.close();
903         if (countfile == "") {
904             outRabund.close();
905             outSabund.close();
906                 }
907                 if (listSingle != NULL) { delete listSingle;  }
908                 
909                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
910                 
911                 return 0;
912         }
913         catch(exception& e) {
914                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
915                 exit(1);
916         }
917 }
918
919 //**********************************************************************************************************************
920
921 void ClusterSplitCommand::printData(ListVector* oldList){
922         try {
923                 string label = oldList->getLabel();
924                 RAbundVector oldRAbund = oldList->getRAbundVector();
925                 
926                 oldRAbund.setLabel(label);
927                 if (m->isTrue(showabund)) {
928                         oldRAbund.getSAbundVector().print(cout);
929                 }
930                 oldRAbund.print(outRabund);
931                 oldRAbund.getSAbundVector().print(outSabund);
932         
933                 oldList->print(outList);
934         }
935         catch(exception& e) {
936                 m->errorOut(e, "ClusterSplitCommand", "printData");
937                 exit(1);
938         }
939 }
940 //**********************************************************************************************************************
941 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
942         try {
943         
944         vector<string> listFiles;
945         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
946         dividedNames.resize(processors);
947         
948         //for each file group figure out which process will complete it
949         //want to divide the load intelligently so the big files are spread between processes
950         for (int i = 0; i < distName.size(); i++) { 
951             //cout << i << endl;
952             int processToAssign = (i+1) % processors; 
953             if (processToAssign == 0) { processToAssign = processors; }
954             
955             dividedNames[(processToAssign-1)].push_back(distName[i]);
956             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
957         }
958         
959         //now lets reverse the order of ever other process, so we balance big files running with little ones
960         for (int i = 0; i < processors; i++) {
961             //cout << i << endl;
962             int remainder = ((i+1) % processors);
963             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
964         }
965         
966         if (m->control_pressed) { return listFiles; }
967         
968         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
969                 int process = 1;
970                 processIDS.clear();
971                 
972                 //loop through and create all the processes you want
973                 while (process != processors) {
974                         int pid = fork();
975                         
976                         if (pid > 0) {
977                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
978                                 process++;
979                         }else if (pid == 0){
980                                 set<string> labels;
981                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
982                                 
983                                 //write out names to file
984                                 string filename = toString(getpid()) + ".temp";
985                                 ofstream out;
986                                 m->openOutputFile(filename, out);
987                                 out << tag << endl;
988                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
989                                 out.close();
990                                 
991                                 //print out labels
992                                 ofstream outLabels;
993                                 filename = toString(getpid()) + ".temp.labels";
994                                 m->openOutputFile(filename, outLabels);
995                                 
996                                 outLabels << cutoff << endl;
997                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
998                                         outLabels << (*it) << endl;
999                                 }
1000                                 outLabels.close();
1001
1002                                 exit(0);
1003                         }else { 
1004                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1005                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1006                                 exit(0);
1007                         }
1008                 }
1009                 
1010         //do your part
1011         listFiles = cluster(dividedNames[0], labels);
1012         
1013                 //force parent to wait until all the processes are done
1014                 for (int i=0;i< processIDS.size();i++) { 
1015                         int temp = processIDS[i];
1016                         wait(&temp);
1017                 }
1018         
1019         //get list of list file names from each process
1020         for(int i=0;i<processIDS.size();i++){
1021             string filename = toString(processIDS[i]) + ".temp";
1022             ifstream in;
1023             m->openInputFile(filename, in);
1024             
1025             in >> tag; m->gobble(in);
1026             
1027             while(!in.eof()) {
1028                 string tempName;
1029                 in >> tempName; m->gobble(in);
1030                 listFiles.push_back(tempName);
1031             }
1032             in.close();
1033             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1034             
1035             //get labels
1036             filename = toString(processIDS[i]) + ".temp.labels";
1037             ifstream in2;
1038             m->openInputFile(filename, in2);
1039             
1040             float tempCutoff;
1041             in2 >> tempCutoff; m->gobble(in2);
1042             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1043             
1044             while(!in2.eof()) {
1045                 string tempName;
1046                 in2 >> tempName; m->gobble(in2);
1047                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1048             }
1049             in2.close();
1050             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1051         }
1052         
1053
1054     #else
1055        
1056         //////////////////////////////////////////////////////////////////////////////////////////////////////
1057                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
1058                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1059                 //Taking advantage of shared memory to allow both threads to add labels.
1060                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1061                 /*
1062                 vector<clusterData*> pDataArray; 
1063                 DWORD   dwThreadIdArray[processors-1];
1064                 HANDLE  hThreadArray[processors-1]; 
1065                 
1066                 //Create processor worker threads.
1067                 for( int i=1; i<processors; i++ ){
1068                         // Allocate memory for thread data.
1069                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1070                         pDataArray.push_back(tempCluster);
1071                         processIDS.push_back(i);
1072             
1073                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1074                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1075                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
1076             
1077                 }
1078         
1079         //do your part
1080         listFiles = cluster(dividedNames[0], labels);
1081         
1082                 //Wait until all threads have terminated.
1083                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1084                 
1085                 //Close all thread handles and free memory allocations.
1086                 for(int i=0; i < pDataArray.size(); i++){
1087             //get tag
1088             tag = pDataArray[i]->tag;
1089             //get listfiles created
1090             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1091             //get labels
1092             set<string>::iterator it;
1093             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1094                         //check cutoff
1095             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1096                         CloseHandle(hThreadArray[i]);
1097                         delete pDataArray[i];
1098                 }
1099 */
1100         #endif          
1101         
1102         return listFiles;
1103         
1104         }
1105         catch(exception& e) {
1106                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1107                 exit(1);
1108         }
1109 }
1110 //**********************************************************************************************************************
1111
1112 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1113         try {
1114                 
1115                 vector<string> listFileNames;
1116                 double smallestCutoff = cutoff;
1117                 
1118                 //cluster each distance file
1119                 for (int i = 0; i < distNames.size(); i++) {
1120             
1121                         string thisNamefile = distNames[i].begin()->second;
1122                         string thisDistFile = distNames[i].begin()->first;
1123                         
1124                         string listFileName = "";
1125             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1126             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1127
1128                         if (m->control_pressed) { //clean up
1129                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1130                                 listFileNames.clear(); return listFileNames;
1131                         }
1132             
1133             listFileNames.push_back(listFileName);
1134         }
1135                 
1136                 cutoff = smallestCutoff;
1137                                         
1138                 return listFileNames;
1139         
1140         }
1141         catch(exception& e) {
1142                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1143                 exit(1);
1144         }
1145
1146
1147 }
1148 //**********************************************************************************************************************
1149 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1150         try {
1151         string listFileName = "";
1152         
1153         ListVector* list = NULL;
1154         ListVector oldList;
1155         RAbundVector* rabund = NULL;
1156         
1157 #ifdef USE_MPI
1158         int pid;
1159         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1160         
1161         //output your files too
1162         if (pid != 0) {
1163             cout << endl << "Reading " << thisDistFile << endl;
1164         }
1165 #endif
1166
1167         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1168         
1169         //reads phylip file storing data in 2D vector, also fills list and rabund
1170         bool sim = false;
1171                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1172         
1173         NameAssignment* nameMap = NULL;
1174         CountTable* ct = NULL;
1175         if(namefile != ""){     
1176                         nameMap = new NameAssignment(thisNamefile);
1177                         nameMap->readMap();
1178             cluster->readPhylipFile(thisDistFile, nameMap);
1179                 }else if (countfile != "") {
1180             ct = new CountTable();
1181             ct->readTable(thisNamefile, false, false);
1182             cluster->readPhylipFile(thisDistFile, ct);
1183         }
1184         tag = cluster->getTag();
1185         
1186                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1187             else { delete ct; } delete cluster; return 0; }
1188                 
1189                 list = cluster->getListVector();
1190                 rabund = cluster->getRAbundVector();
1191         
1192                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1193                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1194         listFileName = fileroot+ tag + ".list";
1195         
1196         ofstream listFile;
1197                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1198                 
1199                 float previousDist = 0.00000;
1200                 float rndPreviousDist = 0.00000;
1201                 oldList = *list;
1202                 
1203 #ifdef USE_MPI
1204         //output your files too
1205         if (pid != 0) {
1206             cout << endl << "Clustering " << thisDistFile << endl;
1207         }
1208 #endif
1209
1210         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1211         
1212                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1213                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1214                 else { delete ct; } return listFileName;  }
1215             
1216                         cluster->update(cutoff);
1217             
1218                         float dist = cluster->getSmallDist();
1219                         float rndDist;
1220                         if (hard) {
1221                                 rndDist = m->ceilDist(dist, precision); 
1222                         }else{
1223                                 rndDist = m->roundDist(dist, precision); 
1224                         }
1225             
1226             if(previousDist <= 0.0000 && dist != previousDist){
1227                 oldList.setLabel("unique");
1228                 oldList.print(listFile);
1229                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1230             }
1231             else if(rndDist != rndPreviousDist){
1232                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1233                 oldList.print(listFile);
1234                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1235             }
1236
1237             
1238                         previousDist = dist;
1239                         rndPreviousDist = rndDist;
1240                         oldList = *list;
1241                 }
1242         
1243                 if(previousDist <= 0.0000){
1244             oldList.setLabel("unique");
1245             oldList.print(listFile);
1246             if (labels.count("unique") == 0) { labels.insert("unique"); }
1247         }
1248         else if(rndPreviousDist<cutoff){
1249             oldList.setLabel(toString(rndPreviousDist,  length-1));
1250             oldList.print(listFile);
1251             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1252         }
1253
1254         
1255                 listFile.close();
1256                 
1257                 delete cluster;  delete list; delete rabund;
1258         if(namefile != ""){     delete nameMap; }
1259         else { delete ct; }
1260         
1261         m->mothurRemove(thisDistFile);
1262         m->mothurRemove(thisNamefile);
1263         
1264         return listFileName;
1265         
1266         }
1267         catch(exception& e) {
1268                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1269                 exit(1);
1270         }
1271 }
1272
1273 //**********************************************************************************************************************
1274 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1275         try {
1276         string listFileName = "";
1277         
1278         Cluster* cluster = NULL;
1279         SparseDistanceMatrix* matrix = NULL;
1280         ListVector* list = NULL;
1281         ListVector oldList;
1282         RAbundVector* rabund = NULL;
1283         
1284         if (m->control_pressed) { return listFileName; }
1285         
1286 #ifdef USE_MPI
1287         int pid;
1288         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1289         
1290         //output your files too
1291         if (pid != 0) {
1292             cout << endl << "Reading " << thisDistFile << endl;
1293         }
1294 #endif
1295         
1296         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1297         
1298         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1299         read->setCutoff(cutoff);
1300         
1301         NameAssignment* nameMap = NULL;
1302         CountTable* ct = NULL;
1303                 if(namefile != ""){     
1304                         nameMap = new NameAssignment(thisNamefile);
1305                         nameMap->readMap();
1306             read->read(nameMap);
1307                 }else if (countfile != "") {
1308             ct = new CountTable();
1309             ct->readTable(thisNamefile, false, false);
1310             read->read(ct);
1311         }else { read->read(nameMap); }
1312                 
1313                 list = read->getListVector();
1314         oldList = *list;
1315                 matrix = read->getDMatrix();
1316         
1317                 if(countfile != "") {
1318             rabund = new RAbundVector();
1319             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1320             delete ct;
1321         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1322
1323         delete read;  read = NULL;
1324         if (namefile != "") { delete nameMap; nameMap = NULL; }
1325         
1326         
1327 #ifdef USE_MPI
1328         //output your files too
1329         if (pid != 0) {
1330             cout << endl << "Clustering " << thisDistFile << endl;
1331         }
1332 #endif
1333         
1334         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1335                 
1336         //create cluster
1337         float adjust = -1.0;
1338         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
1339         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
1340         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust);     }
1341         tag = cluster->getTag();
1342                 
1343         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1344         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1345         
1346         ofstream listFile;
1347         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1348                 
1349         listFileName = fileroot+ tag + ".list";
1350         
1351         float previousDist = 0.00000;
1352         float rndPreviousDist = 0.00000;
1353         
1354         oldList = *list;
1355         
1356         print_start = true;
1357         start = time(NULL);
1358         double saveCutoff = cutoff;
1359                 
1360         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1361             
1362             if (m->control_pressed) { //clean up
1363                 delete matrix; delete list;     delete cluster; delete rabund;
1364                 listFile.close();
1365                 m->mothurRemove(listFileName);  
1366                 return listFileName;
1367             }
1368             
1369             cluster->update(saveCutoff);
1370             
1371             float dist = matrix->getSmallDist();
1372             float rndDist;
1373             if (hard) {
1374                 rndDist = m->ceilDist(dist, precision); 
1375             }else{
1376                 rndDist = m->roundDist(dist, precision); 
1377             }
1378             
1379             if(previousDist <= 0.0000 && dist != previousDist){
1380                 oldList.setLabel("unique");
1381                 oldList.print(listFile);
1382                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1383             }
1384             else if(rndDist != rndPreviousDist){
1385                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1386                 oldList.print(listFile);
1387                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1388             }
1389             
1390             previousDist = dist;
1391             rndPreviousDist = rndDist;
1392             oldList = *list;
1393         }
1394         
1395                 
1396         if(previousDist <= 0.0000){
1397             oldList.setLabel("unique");
1398             oldList.print(listFile);
1399             if (labels.count("unique") == 0) { labels.insert("unique"); }
1400         }
1401         else if(rndPreviousDist<cutoff){
1402             oldList.setLabel(toString(rndPreviousDist,  length-1));
1403             oldList.print(listFile);
1404             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1405         }
1406         
1407         delete matrix; delete list;     delete cluster; delete rabund; 
1408         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1409         listFile.close();
1410         
1411         if (m->control_pressed) { //clean up
1412             m->mothurRemove(listFileName);      
1413             return listFileName;
1414         }
1415         
1416         m->mothurRemove(thisDistFile);
1417         m->mothurRemove(thisNamefile);
1418         
1419         if (saveCutoff != cutoff) { 
1420             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1421             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1422                         
1423             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1424         }
1425         
1426         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1427         
1428         return listFileName;
1429         
1430         }
1431         catch(exception& e) {
1432                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1433                 exit(1);
1434         }
1435 }
1436 //**********************************************************************************************************************
1437
1438 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1439         try{
1440                 
1441 #ifdef USE_MPI
1442                 int pid;
1443                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1444                 
1445                 if (pid != 0) {
1446 #endif
1447                 
1448                 string thisOutputDir = outputDir;
1449                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1450         map<string, string> variables; 
1451         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1452                 string outputFileName = getOutputFileName("column", variables);
1453                 m->mothurRemove(outputFileName);
1454                 
1455                 
1456                 for (int i = 0; i < distNames.size(); i++) {
1457                         if (m->control_pressed) {  return 0; }
1458                         
1459                         string thisDistFile = distNames[i].begin()->first;
1460                         
1461                         m->appendFiles(thisDistFile, outputFileName);
1462                 }       
1463                         
1464                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1465                         
1466 #ifdef USE_MPI
1467                 }
1468 #endif
1469                                 
1470                 return 0;       
1471                 
1472                 
1473         }
1474         catch(exception& e) {
1475                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1476                 exit(1);
1477         }
1478 }
1479 //**********************************************************************************************************************
1480 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1481     try {
1482         rabund->setLabel(list->getLabel());        
1483         for(int i = 0; i < list->getNumBins(); i++) { 
1484             if (m->control_pressed) { break; }
1485             vector<string> binNames;
1486             string bin = list->get(i);
1487             m->splitAtComma(bin, binNames);
1488             int total = 0;
1489             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1490             rabund->push_back(total);   
1491         }
1492         return 0;
1493     }
1494     catch(exception& e) {
1495                 m->errorOut(e, "ClusterCommand", "createRabund");
1496                 exit(1);
1497         }
1498     
1499 }
1500 //**********************************************************************************************************************