]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
update .gitignore
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16         CommandParameter pfile("file", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","",false,false,true); parameters.push_back(pfile);
17                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
18                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
19                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
20                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
21         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
22                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
23                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
24                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
25                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
26                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
27         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
28                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
29                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
30                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
31                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
32                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
33                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
34         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
35                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
36                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37                         
38                 vector<string> myArray;
39                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
40                 return myArray;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
44                 exit(1);
45         }
46 }
47 //**********************************************************************************************************************
48 string ClusterSplitCommand::getHelpString(){    
49         try {
50                 string helpString = "";
51                 helpString += "The cluster.split command parameter options are file, fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
52                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
53                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
54                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
55                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
56                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
57                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
58         helpString += "The file option allows you to enter your file containing your list of column and names/count files as well as the singleton file.  This file is mothur generated, when you run cluster.split() with the cluster=f parameter.  This can be helpful when you have a large dataset that you may be able to use all your processors for the splitting step, but have to reduce them for the cluster step due to RAM constraints. For example: cluster.split(fasta=yourFasta, taxonomy=yourTax, count=yourCount, taxlevel=3, cluster=f, processors=8) then cluster.split(file=yourFile, processors=4).  This allows your to maximize your processors during the splitting step.  Also, if you are unsure if the cluster step will have RAM issue with multiple processors, you can avoid running the first part of the command multiple times.\n";
59                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
60                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
61                 helpString += "The name parameter allows you to enter your name file. \n";
62         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
63         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
64                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
65                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
66                 helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
67                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
68                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
69                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
70                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
71         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
72 #ifdef USE_MPI
73                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
74 #endif
75                 helpString += "The cluster.split command should be in the following format: \n";
76                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
77                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
78                 return helpString;
79         }
80         catch(exception& e) {
81                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
82                 exit(1);
83         }
84 }
85 //**********************************************************************************************************************
86 string ClusterSplitCommand::getOutputPattern(string type) {
87     try {
88         string pattern = "";
89         
90         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
91         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
92         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
93         else if (type == "column") {  pattern = "[filename],dist"; }
94         else if (type == "file")   {  pattern = "[filename],file"; }
95         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
96         
97         return pattern;
98     }
99     catch(exception& e) {
100         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
101         exit(1);
102     }
103 }
104 //**********************************************************************************************************************
105 ClusterSplitCommand::ClusterSplitCommand(){     
106         try {
107                 abort = true; calledHelp = true; 
108                 setParameters();
109                 vector<string> tempOutNames;
110                 outputTypes["list"] = tempOutNames;
111                 outputTypes["rabund"] = tempOutNames;
112                 outputTypes["sabund"] = tempOutNames;
113                 outputTypes["column"] = tempOutNames;
114         outputTypes["file"] = tempOutNames;
115         }
116         catch(exception& e) {
117                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
118                 exit(1);
119         }
120 }
121 //**********************************************************************************************************************
122 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
123 ClusterSplitCommand::ClusterSplitCommand(string option)  {
124         try{
125                 abort = false; calledHelp = false;   
126                 format = "";
127                 
128                 //allow user to run help
129                 if(option == "help") { help(); abort = true; calledHelp = true; }
130                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
131                 
132                 else {
133                         vector<string> myArray = setParameters();
134                         
135                         OptionParser parser(option);
136                         map<string,string> parameters = parser.getParameters();
137                         
138                         ValidParameters validParameter("cluster.split");
139                 
140                         //check to make sure all parameters are valid for command
141                         map<string,string>::iterator it;
142                         for (it = parameters.begin(); it != parameters.end(); it++) { 
143                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
144                                         abort = true;
145                                 }
146                         }
147                         
148                         //initialize outputTypes
149                         vector<string> tempOutNames;
150                         outputTypes["list"] = tempOutNames;
151                         outputTypes["rabund"] = tempOutNames;
152                         outputTypes["sabund"] = tempOutNames;
153                         outputTypes["column"] = tempOutNames;
154             outputTypes["file"] = tempOutNames;
155                         
156                         //if the user changes the output directory command factory will send this info to us in the output parameter 
157                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
158                         
159                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
160                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
161                         if (inputDir == "not found"){   inputDir = "";          }
162                         else {
163                                 string path;
164                                 it = parameters.find("phylip");
165                                 //user has given a template file
166                                 if(it != parameters.end()){ 
167                                         path = m->hasPath(it->second);
168                                         //if the user has not given a path then, add inputdir. else leave path alone.
169                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
170                                 }
171                                 
172                                 it = parameters.find("column");
173                                 //user has given a template file
174                                 if(it != parameters.end()){ 
175                                         path = m->hasPath(it->second);
176                                         //if the user has not given a path then, add inputdir. else leave path alone.
177                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
178                                 }
179                                 
180                                 it = parameters.find("name");
181                                 //user has given a template file
182                                 if(it != parameters.end()){ 
183                                         path = m->hasPath(it->second);
184                                         //if the user has not given a path then, add inputdir. else leave path alone.
185                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
186                                 }
187                                 
188                                 it = parameters.find("taxonomy");
189                                 //user has given a template file
190                                 if(it != parameters.end()){ 
191                                         path = m->hasPath(it->second);
192                                         //if the user has not given a path then, add inputdir. else leave path alone.
193                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
194                                 }
195                                 
196                                 it = parameters.find("fasta");
197                                 //user has given a template file
198                                 if(it != parameters.end()){ 
199                                         path = m->hasPath(it->second);
200                                         //if the user has not given a path then, add inputdir. else leave path alone.
201                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
202                                 }
203                 
204                 it = parameters.find("count");
205                                 //user has given a template file
206                                 if(it != parameters.end()){ 
207                                         path = m->hasPath(it->second);
208                                         //if the user has not given a path then, add inputdir. else leave path alone.
209                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
210                                 }
211                 
212                 it = parameters.find("file");
213                                 //user has given a template file
214                                 if(it != parameters.end()){
215                                         path = m->hasPath(it->second);
216                                         //if the user has not given a path then, add inputdir. else leave path alone.
217                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
218                                 }
219                         }
220                         
221                         //check for required parameters
222                         file = validParameter.validFile(parameters, "file", true);
223                         if (file == "not open") { file = ""; abort = true; }
224                         else if (file == "not found") { file = ""; }
225             else { distfile = file; }
226             
227             phylipfile = validParameter.validFile(parameters, "phylip", true);
228                         if (phylipfile == "not open") { abort = true; }
229                         else if (phylipfile == "not found") { phylipfile = ""; }
230                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
231                         
232                         columnfile = validParameter.validFile(parameters, "column", true);
233                         if (columnfile == "not open") { abort = true; } 
234                         else if (columnfile == "not found") { columnfile = ""; }
235                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
236                         
237                         namefile = validParameter.validFile(parameters, "name", true);
238                         if (namefile == "not open") { abort = true; namefile = "";}     
239                         else if (namefile == "not found") { namefile = "";  }
240                         else { m->setNameFile(namefile); }
241             
242             countfile = validParameter.validFile(parameters, "count", true);
243                         if (countfile == "not open") { abort = true; countfile = "";}   
244                         else if (countfile == "not found") { countfile = "";  }
245                         else { m->setCountTableFile(countfile); }
246                         
247                         fastafile = validParameter.validFile(parameters, "fasta", true);
248                         if (fastafile == "not open") { abort = true; }  
249                         else if (fastafile == "not found") { fastafile = ""; }
250                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
251                         
252                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
253                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
254                         else if (taxFile == "not found") { taxFile = ""; }
255                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
256                         
257                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "") && (file == "")) {
258                                 //is there are current file available for either of these?
259                                 //give priority to column, then phylip, then fasta
260                                 columnfile = m->getColumnFile(); 
261                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
262                                 else { 
263                                         phylipfile = m->getPhylipFile(); 
264                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
265                                         else { 
266                                                 fastafile = m->getFastaFile(); 
267                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
268                                                 else { 
269                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a file, phylip or a column or fastafile."); m->mothurOutEndLine();
270                                                         abort = true; 
271                                                 }
272                                         }
273                                 }
274                         }
275                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "") && (file != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: file, fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
276             
277             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
278             
279                         if (columnfile != "") {
280                                 if ((namefile == "") && (countfile == "")) { 
281                                         namefile = m->getNameFile(); 
282                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
283                                         else { 
284                                                 countfile = m->getCountTableFile();
285                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
286                         else { 
287                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
288                             abort = true; 
289                         }       
290                                         }       
291                                 }
292                         }
293                         
294                         if (fastafile != "") {
295                                 if (taxFile == "") { 
296                                         taxFile = m->getTaxonomyFile(); 
297                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
298                                         else { 
299                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
300                                                 abort = true; 
301                                         }       
302                                 }
303                                 
304                                 if ((namefile == "") && (countfile == "")) { 
305                                         namefile = m->getNameFile(); 
306                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
307                                         else { 
308                                                 countfile = m->getCountTableFile();
309                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
310                         else { 
311                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); 
312                             abort = true; 
313                         }       
314                                         }       
315                                 }
316                         }
317                                         
318                         //check for optional parameter and set defaults
319                         // ...at some point should added some additional type checking...
320                         //get user cutoff and precision or use defaults
321                         string temp;
322                         temp = validParameter.validFile(parameters, "precision", false);
323                         if (temp == "not found") { temp = "100"; }
324                         //saves precision legnth for formatting below
325                         length = temp.length();
326                         m->mothurConvert(temp, precision); 
327                         
328                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
329                         hard = m->isTrue(temp);
330                         
331                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
332                         large = m->isTrue(temp);
333             
334                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
335                         m->setProcessors(temp);
336                         m->mothurConvert(temp, processors);
337                         
338                         temp = validParameter.validFile(parameters, "splitmethod", false);      
339                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
340                                 if (temp == "not found")  { splitmethod = "distance"; }
341                                 else {  splitmethod = temp; }
342                         }
343                         
344             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
345                         classic = m->isTrue(temp);
346             
347             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
348
349                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
350                         m->mothurConvert(temp, cutoff); 
351                         cutoff += (5 / (precision * 10.0));  
352                         
353                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
354                         m->mothurConvert(temp, taxLevelCutoff); 
355                         
356                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
357                         
358                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
359                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
360                         
361                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
362                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
363                         
364                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
365
366                         showabund = validParameter.validFile(parameters, "showabund", false);
367                         if (showabund == "not found") { showabund = "T"; }
368             
369             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
370             runCluster = m->isTrue(temp);
371
372                         timing = validParameter.validFile(parameters, "timing", false);
373                         if (timing == "not found") { timing = "F"; }
374                         
375                 }
376         }
377         catch(exception& e) {
378                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
379                 exit(1);
380         }
381 }
382
383 //**********************************************************************************************************************
384
385 int ClusterSplitCommand::execute(){
386         try {
387         
388                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
389         
390         time_t estart;
391         vector<string> listFileNames;
392         vector< map<string, string> > distName;
393         set<string> labels;
394         string singletonName = "";
395         
396         double saveCutoff = cutoff;
397
398         if (file != "") {  deleteFiles = false; estart = time(NULL); singletonName = readFile(distName); }
399         else {
400                           
401             //****************** file prep work ******************************//
402 #ifdef USE_MPI
403                         int pid;
404                         int tag = 2001;
405                         MPI_Status status;
406                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
407                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
408                         
409                         if (pid == 0) { //only process 0 converts and splits
410                 
411 #endif
412                 
413                 //if user gave a phylip file convert to column file
414                 if (format == "phylip") {
415                     estart = time(NULL);
416                     m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
417                     
418                     ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
419                     
420                     NameAssignment* nameMap = NULL;
421                     convert->setFormat("phylip");
422                     convert->read(nameMap);
423                     
424                     if (m->control_pressed) {  delete convert;  return 0;  }
425                     
426                     distfile = convert->getOutputFile();
427                     
428                     //if no names file given with phylip file, create it
429                     ListVector* listToMakeNameFile =  convert->getListVector();
430                     if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
431                         ofstream out;
432                         namefile = phylipfile + ".names";
433                         m->openOutputFile(namefile, out);
434                         for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
435                             string bin = listToMakeNameFile->get(i);
436                             out << bin << '\t' << bin << endl;
437                         }
438                         out.close();
439                     }
440                     delete listToMakeNameFile;
441                     delete convert;
442                     
443                     m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
444                 }
445                 if (m->control_pressed) { return 0; }
446                 
447                 estart = time(NULL);
448                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
449                 
450                 //split matrix into non-overlapping groups
451                 SplitMatrix* split;
452                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
453                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
454                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
455                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
456                 
457                 split->split();
458                 
459                 if (m->control_pressed) { delete split; return 0; }
460                 
461                 singletonName = split->getSingletonNames();
462                 distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
463                 delete split;
464                 
465                 if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
466                 
467                 //output a merged distance file
468                 //if (splitmethod == "fasta")           { createMergedDistanceFile(distName); }
469                                 
470                 if (m->control_pressed) { return 0; }
471                 
472                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
473                 estart = time(NULL);
474                 
475                 if (!runCluster) { 
476                     printFile(singletonName, distName);
477                     
478                     m->mothurOutEndLine();
479                     m->mothurOut("Output File Names: "); m->mothurOutEndLine();
480                     for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
481                     m->mothurOutEndLine();
482                     return 0;
483                     
484                 }
485                 deleteFiles = true;
486             }
487                 //****************** break up files between processes and cluster each file set ******************************//
488         #ifdef USE_MPI
489                         ////you are process 0 from above////
490                         
491                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
492                         dividedNames.resize(processors);
493                                         
494                         //for each file group figure out which process will complete it
495                         //want to divide the load intelligently so the big files are spread between processes
496                         for (int i = 0; i < distName.size(); i++) { 
497                                 int processToAssign = (i+1) % processors; 
498                                 if (processToAssign == 0) { processToAssign = processors; }
499                                                 
500                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
501                         }
502                                         
503                         //not lets reverse the order of ever other process, so we balance big files running with little ones
504                         for (int i = 0; i < processors; i++) {
505                                 int remainder = ((i+1) % processors);
506                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
507                         }
508                         
509                         
510                         //send each child the list of files it needs to process
511                         for(int i = 1; i < processors; i++) { 
512                                 //send number of file pairs
513                                 int num = dividedNames[i].size();
514                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
515                                 
516                                 for (int j = 0; j < num; j++) { //send filenames to process i
517                                         char tempDistFileName[1024];
518                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
519                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
520                                         
521                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
522                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
523                                         
524                                         char tempNameFileName[1024];
525                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
526                                         int lengthName = (dividedNames[i][j].begin()->second).length();
527
528                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
529                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
530                                 }
531                         }
532                         
533                         //process your share
534                         listFileNames = cluster(dividedNames[0], labels);
535                         
536                         //receive the other processes info
537                         for(int i = 1; i < processors; i++) { 
538                                 int num = dividedNames[i].size();
539                                 
540                                 double tempCutoff;
541                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
542                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
543                                 
544                                 //send list filenames to root process
545                                 for (int j = 0; j < num; j++) {  
546                                         int lengthList = 0;
547                                         char tempListFileName[1024];
548                                 
549                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
550                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
551                                 
552                                         string myListFileName = tempListFileName;
553                                         myListFileName = myListFileName.substr(0, lengthList);
554                                         
555                                         listFileNames.push_back(myListFileName);
556                                 }
557                                 
558                                 //send Labels to root process
559                                 int numLabels = 0;
560                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
561                                 
562                                 for (int j = 0; j < numLabels; j++) {  
563                                         int lengthLabel = 0;
564                                         char tempLabel[100];
565                                 
566                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
567                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
568                                 
569                                         string myLabel = tempLabel;
570                                         myLabel = myLabel.substr(0, lengthLabel);
571                         
572                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
573                                 }
574                         }
575                         
576                 }else { //you are a child process
577                         vector < map<string, string> >  myNames;
578                         
579                         //recieve the files you need to process
580                         //receive number of file pairs
581                         int num = 0;
582                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
583                         
584                         myNames.resize(num);
585         
586                         for (int j = 0; j < num; j++) { //receive filenames to process 
587                                 int lengthDist = 0;
588                                 char tempDistFileName[1024];
589                                 
590                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
591                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
592                                 
593                                 string myDistFileName = tempDistFileName;
594                                 myDistFileName = myDistFileName.substr(0, lengthDist);
595                         
596                                 int lengthName = 0;
597                                 char tempNameFileName[1024];
598                                 
599                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
600                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
601                                 
602                                 string myNameFileName = tempNameFileName;
603                                 myNameFileName = myNameFileName.substr(0, lengthName);
604                                 
605                                 //save file name
606                                 myNames[j][myDistFileName] = myNameFileName;
607                         }
608         
609                         //process them
610                         listFileNames = cluster(myNames, labels);
611                         
612                         //send cutoff
613                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
614                         
615                         //send list filenames to root process
616                         for (int j = 0; j < num; j++) {  
617                                 char tempListFileName[1024];
618                                 strcpy(tempListFileName, listFileNames[j].c_str());
619                                 int lengthList = listFileNames[j].length();
620                                         
621                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
622                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
623                         }
624                         
625                         //send Labels to root process
626                         int numLabels = labels.size();
627                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
628                         
629                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
630                                 char tempLabel[100];
631                                 strcpy(tempLabel, (*it).c_str());
632                                 int lengthLabel = (*it).length();
633                                         
634                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
635                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
636                         }
637                 }
638                 
639                 //make everyone wait
640                 MPI_Barrier(MPI_COMM_WORLD);
641                 
642         #else
643                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
644                 //sanity check
645                 if (processors > distName.size()) { processors = distName.size(); }
646                 
647                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
648                                 if(processors == 1){
649                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
650                                 }else{
651                                         listFileNames = createProcesses(distName, labels);
652                 }
653                 #else
654                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
655                 #endif
656         #endif  
657                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
658                 
659                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
660                 
661                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
662                 
663                 //****************** merge list file and create rabund and sabund files ******************************//
664                 estart = time(NULL);
665                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
666                 
667                 #ifdef USE_MPI
668                         if (pid == 0) { //only process 0 merges
669                 #endif
670
671                 ListVector* listSingle;
672                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
673                 
674                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
675                 
676                 mergeLists(listFileNames, labelBins, listSingle);
677
678                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
679         
680         //delete after all are complete incase a crash happens
681         if (!deleteFiles) { for (int i = 0; i < distName.size(); i++) { m->mothurRemove(distName[i].begin()->first); m->mothurRemove(distName[i].begin()->second);      } }
682                 
683                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
684                 
685                 //set list file as new current listfile
686                 string current = "";
687                 itTypes = outputTypes.find("list");
688                 if (itTypes != outputTypes.end()) {
689                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
690                 }
691                 
692                 //set rabund file as new current rabundfile
693                 itTypes = outputTypes.find("rabund");
694                 if (itTypes != outputTypes.end()) {
695                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
696                 }
697                 
698                 //set sabund file as new current sabundfile
699                 itTypes = outputTypes.find("sabund");
700                 if (itTypes != outputTypes.end()) {
701                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
702                 }
703                                 
704                 m->mothurOutEndLine();
705                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
706                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
707                 m->mothurOutEndLine();
708                 
709                 #ifdef USE_MPI
710                         } //only process 0 merges
711                         
712                         //make everyone wait
713                         MPI_Barrier(MPI_COMM_WORLD);
714                 #endif
715
716                 return 0;
717         }
718         catch(exception& e) {
719                 m->errorOut(e, "ClusterSplitCommand", "execute");
720                 exit(1);
721         }
722 }
723 //**********************************************************************************************************************
724 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
725         try {
726                                 
727                 map<float, int> labelBin;
728                 vector<float> orderFloat;
729                 int numSingleBins;
730                 
731                 //read in singletons
732                 if (singleton != "none") {
733             
734             ifstream in;
735             m->openInputFile(singleton, in);
736                                 
737                         string firstCol, secondCol;
738                         listSingle = new ListVector();
739             
740             if (countfile != "") { m->getline(in); m->gobble(in); }
741             
742                         while (!in.eof()) {
743                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
744                                 if (countfile == "") { listSingle->push_back(secondCol); }
745                 else { listSingle->push_back(firstCol); }
746                         }
747             
748                         in.close();
749                         m->mothurRemove(singleton);
750                         
751                         numSingleBins = listSingle->getNumBins();
752                 }else{  listSingle = NULL; numSingleBins = 0;  }
753                 
754                 //go through users set and make them floats so we can sort them 
755                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
756                         float temp = -10.0;
757
758                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
759                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
760                         
761                         if (temp <= cutoff) {
762                                 orderFloat.push_back(temp);
763                                 labelBin[temp] = numSingleBins; //initialize numbins 
764                         }
765                 }
766         
767                 //sort order
768                 sort(orderFloat.begin(), orderFloat.end());
769                 userLabels.clear();
770                         
771                 //get the list info from each file
772                 for (int k = 0; k < listNames.size(); k++) {
773         
774                         if (m->control_pressed) {  
775                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
776                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
777                                 return labelBin;
778                         }
779                         
780                         InputData* input = new InputData(listNames[k], "list");
781                         ListVector* list = input->getListVector();
782                         string lastLabel = list->getLabel();
783                         
784                         string filledInList = listNames[k] + "filledInTemp";
785                         ofstream outFilled;
786                         m->openOutputFile(filledInList, outFilled);
787         
788                         //for each label needed
789                         for(int l = 0; l < orderFloat.size(); l++){
790                         
791                                 string thisLabel;
792                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
793                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
794
795                                 //this file has reached the end
796                                 if (list == NULL) { 
797                                         list = input->getListVector(lastLabel, true); 
798                                 }else{  //do you have the distance, or do you need to fill in
799                                                 
800                                         float labelFloat;
801                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
802                                         else { convert(list->getLabel(), labelFloat); }
803
804                                         //check for missing labels
805                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
806                                                 //if its bigger get last label, otherwise keep it
807                                                 delete list;
808                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
809                                         }
810                                         lastLabel = list->getLabel();
811                                 }
812                                 
813                                 //print to new file
814                                 list->setLabel(thisLabel);
815                                 list->print(outFilled);
816                 
817                                 //update labelBin
818                                 labelBin[orderFloat[l]] += list->getNumBins();
819                                                                         
820                                 delete list;
821                                                                         
822                                 list = input->getListVector();
823                         }
824                         
825                         if (list != NULL) { delete list; }
826                         delete input;
827                         
828                         outFilled.close();
829                         m->mothurRemove(listNames[k]);
830                         rename(filledInList.c_str(), listNames[k].c_str());
831                 }
832                 
833                 return labelBin;
834         }
835         catch(exception& e) {
836                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
837                 exit(1);
838         }
839 }
840 //**********************************************************************************************************************
841 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
842         try {
843                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
844                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
845                 
846         map<string, string> variables; 
847         variables["[filename]"] = fileroot;
848         variables["[clustertag]"] = tag;
849         string sabundFileName = getOutputFileName("sabund", variables);
850         string rabundFileName = getOutputFileName("rabund", variables);
851         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
852         string listFileName = getOutputFileName("list", variables);
853         
854         if (countfile == "") {
855             m->openOutputFile(sabundFileName,   outSabund);
856             m->openOutputFile(rabundFileName,   outRabund);
857             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
858             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
859             
860         }
861                 m->openOutputFile(listFileName, outList);
862         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
863                 
864                 map<float, int>::iterator itLabel;
865         
866         //clears out junk for autocompleting of list files above.  Perhaps there is a beter way to handle this from within the data structure?
867         m->printedListHeaders = false;
868
869                 //for each label needed
870                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
871                         
872                         string thisLabel;
873                         if (itLabel->first == -1) { thisLabel = "unique"; }
874                         else { thisLabel = toString(itLabel->first,  length-1);  } 
875                         
876                         //outList << thisLabel << '\t' << itLabel->second << '\t';
877             
878             RAbundVector* rabund = NULL;
879             ListVector completeList;
880             completeList.setLabel(thisLabel);
881             
882             if (countfile == "") {
883                 rabund = new RAbundVector();
884                 rabund->setLabel(thisLabel);
885             }
886
887                         //add in singletons
888                         if (listSingle != NULL) {
889                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
890                                         //outList << listSingle->get(j) << '\t';
891                     completeList.push_back(listSingle->get(j));
892                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
893                                 }
894                         }
895                         
896                         //get the list info from each file
897                         for (int k = 0; k < listNames.size(); k++) {
898         
899                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
900                                 
901                                 InputData* input = new InputData(listNames[k], "list");
902                                 ListVector* list = input->getListVector(thisLabel);
903                                 
904                                 //this file has reached the end
905                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
906                                 else {          
907                                         for (int j = 0; j < list->getNumBins(); j++) {
908                                                 //outList << list->get(j) << '\t';
909                         completeList.push_back(list->get(j));
910                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
911                                         }
912                                         delete list;
913                                 }
914                                 delete input;
915                         }
916                         
917             if (countfile == "") {
918                 SAbundVector sabund = rabund->getSAbundVector();
919                 sabund.print(outSabund);
920                 rabund->print(outRabund);
921             }
922                         //outList << endl;
923             if (!m->printedListHeaders) { 
924                 m->listBinLabelsInFile.clear(); completeList.printHeaders(outList); }
925             completeList.print(outList);
926                         
927                         if (rabund != NULL) { delete rabund; }
928                 }
929                 
930                 outList.close();
931         if (countfile == "") {
932             outRabund.close();
933             outSabund.close();
934                 }
935                 if (listSingle != NULL) { delete listSingle;  }
936                 
937                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
938                 
939                 return 0;
940         }
941         catch(exception& e) {
942                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
943                 exit(1);
944         }
945 }
946
947 //**********************************************************************************************************************
948
949 void ClusterSplitCommand::printData(ListVector* oldList){
950         try {
951                 string label = oldList->getLabel();
952                 RAbundVector oldRAbund = oldList->getRAbundVector();
953                 
954                 oldRAbund.setLabel(label);
955                 if (m->isTrue(showabund)) {
956                         oldRAbund.getSAbundVector().print(cout);
957                 }
958                 oldRAbund.print(outRabund);
959                 oldRAbund.getSAbundVector().print(outSabund);
960         
961                 oldList->print(outList);
962         }
963         catch(exception& e) {
964                 m->errorOut(e, "ClusterSplitCommand", "printData");
965                 exit(1);
966         }
967 }
968 //**********************************************************************************************************************
969 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
970         try {
971         
972         vector<string> listFiles;
973         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
974         dividedNames.resize(processors);
975         
976         //for each file group figure out which process will complete it
977         //want to divide the load intelligently so the big files are spread between processes
978         for (int i = 0; i < distName.size(); i++) { 
979             //cout << i << endl;
980             int processToAssign = (i+1) % processors; 
981             if (processToAssign == 0) { processToAssign = processors; }
982             
983             dividedNames[(processToAssign-1)].push_back(distName[i]);
984             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
985         }
986         
987         //now lets reverse the order of ever other process, so we balance big files running with little ones
988         for (int i = 0; i < processors; i++) {
989             //cout << i << endl;
990             int remainder = ((i+1) % processors);
991             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
992         }
993         
994         if (m->control_pressed) { return listFiles; }
995         
996         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
997                 int process = 1;
998                 processIDS.clear();
999                 
1000                 //loop through and create all the processes you want
1001                 while (process != processors) {
1002                         pid_t pid = fork();
1003                         
1004                         if (pid > 0) {
1005                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1006                                 process++;
1007                         }else if (pid == 0){
1008                                 set<string> labels;
1009                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
1010                                 
1011                                 //write out names to file
1012                                 string filename = m->mothurGetpid(process) + ".temp";
1013                                 ofstream out;
1014                                 m->openOutputFile(filename, out);
1015                                 out << tag << endl;
1016                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
1017                                 out.close();
1018                                 
1019                                 //print out labels
1020                                 ofstream outLabels;
1021                                 filename = m->mothurGetpid(process) + ".temp.labels";
1022                                 m->openOutputFile(filename, outLabels);
1023                                 
1024                                 outLabels << cutoff << endl;
1025                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
1026                                         outLabels << (*it) << endl;
1027                                 }
1028                                 outLabels.close();
1029
1030                                 exit(0);
1031                         }else { 
1032                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1033                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1034                                 exit(0);
1035                         }
1036                 }
1037                 
1038         //do your part
1039         listFiles = cluster(dividedNames[0], labels);
1040         
1041                 //force parent to wait until all the processes are done
1042                 for (int i=0;i< processIDS.size();i++) { 
1043                         int temp = processIDS[i];
1044                         wait(&temp);
1045                 }
1046         
1047         //get list of list file names from each process
1048         for(int i=0;i<processIDS.size();i++){
1049             string filename = toString(processIDS[i]) + ".temp";
1050             ifstream in;
1051             m->openInputFile(filename, in);
1052             
1053             in >> tag; m->gobble(in);
1054             
1055             while(!in.eof()) {
1056                 string tempName;
1057                 in >> tempName; m->gobble(in);
1058                 listFiles.push_back(tempName);
1059             }
1060             in.close();
1061             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1062             
1063             //get labels
1064             filename = toString(processIDS[i]) + ".temp.labels";
1065             ifstream in2;
1066             m->openInputFile(filename, in2);
1067             
1068             float tempCutoff;
1069             in2 >> tempCutoff; m->gobble(in2);
1070             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1071             
1072             while(!in2.eof()) {
1073                 string tempName;
1074                 in2 >> tempName; m->gobble(in2);
1075                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1076             }
1077             in2.close();
1078             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1079         }
1080         
1081
1082     #else
1083        
1084         //////////////////////////////////////////////////////////////////////////////////////////////////////
1085                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
1086                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1087                 //Taking advantage of shared memory to allow both threads to add labels.
1088                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1089                 /*
1090                 vector<clusterData*> pDataArray; 
1091                 DWORD   dwThreadIdArray[processors-1];
1092                 HANDLE  hThreadArray[processors-1]; 
1093                 
1094                 //Create processor worker threads.
1095                 for( int i=1; i<processors; i++ ){
1096                         // Allocate memory for thread data.
1097                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1098                         pDataArray.push_back(tempCluster);
1099                         processIDS.push_back(i);
1100             
1101                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1102                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1103                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
1104             
1105                 }
1106         
1107         //do your part
1108         listFiles = cluster(dividedNames[0], labels);
1109         
1110                 //Wait until all threads have terminated.
1111                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1112                 
1113                 //Close all thread handles and free memory allocations.
1114                 for(int i=0; i < pDataArray.size(); i++){
1115             //get tag
1116             tag = pDataArray[i]->tag;
1117             //get listfiles created
1118             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1119             //get labels
1120             set<string>::iterator it;
1121             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1122                         //check cutoff
1123             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1124                         CloseHandle(hThreadArray[i]);
1125                         delete pDataArray[i];
1126                 }
1127 */
1128         #endif          
1129         
1130         return listFiles;
1131         
1132         }
1133         catch(exception& e) {
1134                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1135                 exit(1);
1136         }
1137 }
1138 //**********************************************************************************************************************
1139
1140 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1141         try {
1142                 
1143                 vector<string> listFileNames;
1144                 double smallestCutoff = cutoff;
1145                 
1146                 //cluster each distance file
1147                 for (int i = 0; i < distNames.size(); i++) {
1148             
1149                         string thisNamefile = distNames[i].begin()->second;
1150                         string thisDistFile = distNames[i].begin()->first;
1151                         
1152                         string listFileName = "";
1153             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1154             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1155
1156                         if (m->control_pressed) { //clean up
1157                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1158                                 listFileNames.clear(); return listFileNames;
1159                         }
1160             
1161             listFileNames.push_back(listFileName);
1162         }
1163                 
1164                 cutoff = smallestCutoff;
1165                                         
1166                 return listFileNames;
1167         
1168         }
1169         catch(exception& e) {
1170                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1171                 exit(1);
1172         }
1173
1174
1175 }
1176 //**********************************************************************************************************************
1177 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1178         try {
1179         string listFileName = "";
1180         
1181         ListVector* list = NULL;
1182         ListVector oldList;
1183         RAbundVector* rabund = NULL;
1184         
1185 #ifdef USE_MPI
1186         int pid;
1187         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1188         
1189         //output your files too
1190         if (pid != 0) {
1191             cout << endl << "Reading " << thisDistFile << endl;
1192         }
1193 #endif
1194
1195         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1196         
1197         //reads phylip file storing data in 2D vector, also fills list and rabund
1198         bool sim = false;
1199                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1200         
1201         NameAssignment* nameMap = NULL;
1202         CountTable* ct = NULL;
1203         if(namefile != ""){     
1204                         nameMap = new NameAssignment(thisNamefile);
1205                         nameMap->readMap();
1206             cluster->readPhylipFile(thisDistFile, nameMap);
1207                 }else if (countfile != "") {
1208             ct = new CountTable();
1209             ct->readTable(thisNamefile, false, false);
1210             cluster->readPhylipFile(thisDistFile, ct);
1211         }
1212         tag = cluster->getTag();
1213         
1214                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1215             else { delete ct; } delete cluster; return 0; }
1216                 
1217                 list = cluster->getListVector();
1218                 rabund = cluster->getRAbundVector();
1219         
1220                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1221                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1222         listFileName = fileroot+ tag + ".list";
1223         
1224         ofstream listFile;
1225                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1226                 
1227                 float previousDist = 0.00000;
1228                 float rndPreviousDist = 0.00000;
1229                 oldList = *list;
1230                 
1231 #ifdef USE_MPI
1232         //output your files too
1233         if (pid != 0) {
1234             cout << endl << "Clustering " << thisDistFile << endl;
1235         }
1236 #endif
1237
1238         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1239         
1240                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1241                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1242                 else { delete ct; } return listFileName;  }
1243             
1244                         cluster->update(cutoff);
1245             
1246                         float dist = cluster->getSmallDist();
1247                         float rndDist;
1248                         if (hard) {
1249                                 rndDist = m->ceilDist(dist, precision); 
1250                         }else{
1251                                 rndDist = m->roundDist(dist, precision); 
1252                         }
1253             
1254             if(previousDist <= 0.0000 && dist != previousDist){
1255                 oldList.setLabel("unique");
1256                 oldList.print(listFile);
1257                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1258             }
1259             else if(rndDist != rndPreviousDist){
1260                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1261                 oldList.print(listFile);
1262                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1263             }
1264
1265             
1266                         previousDist = dist;
1267                         rndPreviousDist = rndDist;
1268                         oldList = *list;
1269                 }
1270         
1271                 if(previousDist <= 0.0000){
1272             oldList.setLabel("unique");
1273             oldList.print(listFile);
1274             if (labels.count("unique") == 0) { labels.insert("unique"); }
1275         }
1276         else if(rndPreviousDist<cutoff){
1277             oldList.setLabel(toString(rndPreviousDist,  length-1));
1278             oldList.print(listFile);
1279             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1280         }
1281
1282         
1283                 listFile.close();
1284                 
1285                 delete cluster;  delete list; delete rabund;
1286         if(namefile != ""){     delete nameMap; }
1287         else { delete ct; }
1288         
1289         if (deleteFiles) {
1290             m->mothurRemove(thisDistFile);
1291             m->mothurRemove(thisNamefile);
1292         }
1293         return listFileName;
1294         
1295         }
1296         catch(exception& e) {
1297                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1298                 exit(1);
1299         }
1300 }
1301
1302 //**********************************************************************************************************************
1303 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1304         try {
1305         string listFileName = "";
1306         
1307         Cluster* cluster = NULL;
1308         SparseDistanceMatrix* matrix = NULL;
1309         ListVector* list = NULL;
1310         ListVector oldList;
1311         RAbundVector* rabund = NULL;
1312         
1313         if (m->control_pressed) { return listFileName; }
1314         
1315 #ifdef USE_MPI
1316         int pid;
1317         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1318         
1319         //output your files too
1320         if (pid != 0) {
1321             cout << endl << "Reading " << thisDistFile << endl;
1322         }
1323 #endif
1324         
1325         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1326         
1327         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1328         read->setCutoff(cutoff);
1329         
1330         NameAssignment* nameMap = NULL;
1331         CountTable* ct = NULL;
1332                 if(namefile != ""){     
1333                         nameMap = new NameAssignment(thisNamefile);
1334                         nameMap->readMap();
1335             read->read(nameMap);
1336                 }else if (countfile != "") {
1337             ct = new CountTable();
1338             ct->readTable(thisNamefile, false, false);
1339             read->read(ct);
1340         }else { read->read(nameMap); }
1341                 
1342                 list = read->getListVector();
1343         oldList = *list;
1344                 matrix = read->getDMatrix();
1345         
1346                 if(countfile != "") {
1347             rabund = new RAbundVector();
1348             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1349             delete ct;
1350         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1351
1352         delete read;  read = NULL;
1353         if (namefile != "") { delete nameMap; nameMap = NULL; }
1354         
1355         
1356 #ifdef USE_MPI
1357         //output your files too
1358         if (pid != 0) {
1359             cout << endl << "Clustering " << thisDistFile << endl;
1360         }
1361 #endif
1362         
1363         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1364                 
1365         //create cluster
1366         float adjust = -1.0;
1367         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
1368         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
1369         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust);     }
1370         tag = cluster->getTag();
1371                 
1372         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1373         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1374         
1375         ofstream listFile;
1376         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1377                 
1378         listFileName = fileroot+ tag + ".list";
1379         
1380         float previousDist = 0.00000;
1381         float rndPreviousDist = 0.00000;
1382         
1383         oldList = *list;
1384         
1385         print_start = true;
1386         start = time(NULL);
1387         double saveCutoff = cutoff;
1388                 
1389         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1390             
1391             if (m->control_pressed) { //clean up
1392                 delete matrix; delete list;     delete cluster; delete rabund;
1393                 listFile.close();
1394                 m->mothurRemove(listFileName);  
1395                 return listFileName;
1396             }
1397             
1398             cluster->update(saveCutoff);
1399             
1400             float dist = matrix->getSmallDist();
1401             float rndDist;
1402             if (hard) {
1403                 rndDist = m->ceilDist(dist, precision); 
1404             }else{
1405                 rndDist = m->roundDist(dist, precision); 
1406             }
1407             
1408             if(previousDist <= 0.0000 && dist != previousDist){
1409                 oldList.setLabel("unique");
1410                 oldList.print(listFile);
1411                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1412             }
1413             else if(rndDist != rndPreviousDist){
1414                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1415                 oldList.print(listFile);
1416                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1417             }
1418             
1419             previousDist = dist;
1420             rndPreviousDist = rndDist;
1421             oldList = *list;
1422         }
1423         
1424                 
1425         if(previousDist <= 0.0000){
1426             oldList.setLabel("unique");
1427             oldList.print(listFile);
1428             if (labels.count("unique") == 0) { labels.insert("unique"); }
1429         }
1430         else if(rndPreviousDist<cutoff){
1431             oldList.setLabel(toString(rndPreviousDist,  length-1));
1432             oldList.print(listFile);
1433             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1434         }
1435         
1436         delete matrix; delete list;     delete cluster; delete rabund; 
1437         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1438         listFile.close();
1439         
1440         if (m->control_pressed) { //clean up
1441             m->mothurRemove(listFileName);      
1442             return listFileName;
1443         }
1444         
1445         if (deleteFiles) {
1446             m->mothurRemove(thisDistFile);
1447             m->mothurRemove(thisNamefile);
1448         }
1449         
1450         if (saveCutoff != cutoff) { 
1451             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1452             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1453                         
1454             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1455         }
1456         
1457         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1458         
1459         return listFileName;
1460         
1461         }
1462         catch(exception& e) {
1463                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1464                 exit(1);
1465         }
1466 }
1467 //**********************************************************************************************************************
1468
1469 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1470         try{
1471                 
1472 #ifdef USE_MPI
1473                 int pid;
1474                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1475                 
1476                 if (pid != 0) {
1477 #endif
1478                 
1479                 string thisOutputDir = outputDir;
1480                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1481         map<string, string> variables; 
1482         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1483                 string outputFileName = getOutputFileName("column", variables);
1484                 m->mothurRemove(outputFileName);
1485                 
1486                 
1487                 for (int i = 0; i < distNames.size(); i++) {
1488                         if (m->control_pressed) {  return 0; }
1489                         
1490                         string thisDistFile = distNames[i].begin()->first;
1491                         
1492                         m->appendFiles(thisDistFile, outputFileName);
1493                 }       
1494                         
1495                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1496                         
1497 #ifdef USE_MPI
1498                 }
1499 #endif
1500                                 
1501                 return 0;       
1502                 
1503                 
1504         }
1505         catch(exception& e) {
1506                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1507                 exit(1);
1508         }
1509 }
1510 //**********************************************************************************************************************
1511 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1512     try {
1513         rabund->setLabel(list->getLabel());        
1514         for(int i = 0; i < list->getNumBins(); i++) { 
1515             if (m->control_pressed) { break; }
1516             vector<string> binNames;
1517             string bin = list->get(i);
1518             m->splitAtComma(bin, binNames);
1519             int total = 0;
1520             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1521             rabund->push_back(total);   
1522         }
1523         return 0;
1524     }
1525     catch(exception& e) {
1526                 m->errorOut(e, "ClusterCommand", "createRabund");
1527                 exit(1);
1528         }
1529     
1530 }
1531 //**********************************************************************************************************************
1532 int ClusterSplitCommand::printFile(string singleton, vector< map<string, string> >& distName){
1533     try {
1534         ofstream out;
1535         map<string, string> variables;
1536         string thisOutputDir = outputDir;
1537                 if (outputDir == "") { thisOutputDir = m->hasPath(distfile); }
1538         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(distfile));
1539                 string outputFileName = getOutputFileName("file", variables);
1540         m->openOutputFile(outputFileName, out);
1541         
1542         out << singleton << endl;
1543         if (namefile != "") { out << "name" << endl; }
1544         else if (countfile != "") { out << "count" << endl; }
1545         else { out << "unknown" << endl; }
1546         
1547         for (int i = 0; i < distName.size(); i++) {    out << distName[i].begin()->first << '\t' << distName[i].begin()->second << endl;        }
1548         out.close();
1549         
1550         return 0;
1551     }
1552     catch(exception& e) {
1553                 m->errorOut(e, "ClusterCommand", "printFile");
1554                 exit(1);
1555         }
1556     
1557 }
1558 //**********************************************************************************************************************
1559 string ClusterSplitCommand::readFile(vector< map<string, string> >& distName){
1560     try {
1561         string singleton, thiscolumn, thisname, type;
1562         
1563         ifstream in;
1564         m->openInputFile(file, in);
1565         
1566         in >> singleton; m->gobble(in);
1567         in >> type; m->gobble(in);
1568         
1569         if (type == "name") { namefile = "name"; }
1570         else if (type == "count") { countfile = "count"; }
1571         else {  m->mothurOut("[ERROR]: unknown file type. Are the files in column 2 of the file name files or count files? Please change unknown to name or count.\n"); m->control_pressed = true; }
1572         
1573         
1574         while(!in.eof()) {
1575             if (m->control_pressed) { break; }
1576             
1577             in >> thiscolumn; m->gobble(in);
1578             in >> thisname; m->gobble(in);
1579             
1580             map<string, string> temp;
1581             temp[thiscolumn] = thisname;
1582             distName.push_back(temp);
1583         }
1584         in.close();
1585         
1586         return singleton;
1587     }
1588     catch(exception& e) {
1589                 m->errorOut(e, "ClusterCommand", "readFile");
1590                 exit(1);
1591         }
1592     
1593 }
1594 //**********************************************************************************************************************