]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
added topdown parameter to pre.cluster. added more debugging output to bayesian...
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
36                         
37                 vector<string> myArray;
38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
39                 return myArray;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
43                 exit(1);
44         }
45 }
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){    
48         try {
49                 string helpString = "";
50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59                 helpString += "The name parameter allows you to enter your name file. \n";
60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
70 #ifdef USE_MPI
71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
72 #endif
73                 helpString += "The cluster.split command should be in the following format: \n";
74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
76                 return helpString;
77         }
78         catch(exception& e) {
79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
80                 exit(1);
81         }
82 }
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
85     try {
86         string pattern = "";
87         
88         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
89         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
90         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
91         else if (type == "column") {  pattern = "[filename],dist"; }
92         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
93         
94         return pattern;
95     }
96     catch(exception& e) {
97         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
98         exit(1);
99     }
100 }
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){     
103         try {
104                 abort = true; calledHelp = true; 
105                 setParameters();
106                 vector<string> tempOutNames;
107                 outputTypes["list"] = tempOutNames;
108                 outputTypes["rabund"] = tempOutNames;
109                 outputTypes["sabund"] = tempOutNames;
110                 outputTypes["column"] = tempOutNames;
111         }
112         catch(exception& e) {
113                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
114                 exit(1);
115         }
116 }
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option)  {
120         try{
121                 abort = false; calledHelp = false;   
122                 format = "";
123                 
124                 //allow user to run help
125                 if(option == "help") { help(); abort = true; calledHelp = true; }
126                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
127                 
128                 else {
129                         vector<string> myArray = setParameters();
130                         
131                         OptionParser parser(option);
132                         map<string,string> parameters = parser.getParameters();
133                         
134                         ValidParameters validParameter("cluster.split");
135                 
136                         //check to make sure all parameters are valid for command
137                         map<string,string>::iterator it;
138                         for (it = parameters.begin(); it != parameters.end(); it++) { 
139                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
140                                         abort = true;
141                                 }
142                         }
143                         
144                         //initialize outputTypes
145                         vector<string> tempOutNames;
146                         outputTypes["list"] = tempOutNames;
147                         outputTypes["rabund"] = tempOutNames;
148                         outputTypes["sabund"] = tempOutNames;
149                         outputTypes["column"] = tempOutNames;
150                         
151                         //if the user changes the output directory command factory will send this info to us in the output parameter 
152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
153                         
154                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
155                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
156                         if (inputDir == "not found"){   inputDir = "";          }
157                         else {
158                                 string path;
159                                 it = parameters.find("phylip");
160                                 //user has given a template file
161                                 if(it != parameters.end()){ 
162                                         path = m->hasPath(it->second);
163                                         //if the user has not given a path then, add inputdir. else leave path alone.
164                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
165                                 }
166                                 
167                                 it = parameters.find("column");
168                                 //user has given a template file
169                                 if(it != parameters.end()){ 
170                                         path = m->hasPath(it->second);
171                                         //if the user has not given a path then, add inputdir. else leave path alone.
172                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
173                                 }
174                                 
175                                 it = parameters.find("name");
176                                 //user has given a template file
177                                 if(it != parameters.end()){ 
178                                         path = m->hasPath(it->second);
179                                         //if the user has not given a path then, add inputdir. else leave path alone.
180                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
181                                 }
182                                 
183                                 it = parameters.find("taxonomy");
184                                 //user has given a template file
185                                 if(it != parameters.end()){ 
186                                         path = m->hasPath(it->second);
187                                         //if the user has not given a path then, add inputdir. else leave path alone.
188                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
189                                 }
190                                 
191                                 it = parameters.find("fasta");
192                                 //user has given a template file
193                                 if(it != parameters.end()){ 
194                                         path = m->hasPath(it->second);
195                                         //if the user has not given a path then, add inputdir. else leave path alone.
196                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
197                                 }
198                 
199                 it = parameters.find("count");
200                                 //user has given a template file
201                                 if(it != parameters.end()){ 
202                                         path = m->hasPath(it->second);
203                                         //if the user has not given a path then, add inputdir. else leave path alone.
204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
205                                 }
206                         }
207                         
208                         //check for required parameters
209                         phylipfile = validParameter.validFile(parameters, "phylip", true);
210                         if (phylipfile == "not open") { abort = true; }
211                         else if (phylipfile == "not found") { phylipfile = ""; }        
212                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
213                         
214                         columnfile = validParameter.validFile(parameters, "column", true);
215                         if (columnfile == "not open") { abort = true; } 
216                         else if (columnfile == "not found") { columnfile = ""; }
217                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
218                         
219                         namefile = validParameter.validFile(parameters, "name", true);
220                         if (namefile == "not open") { abort = true; namefile = "";}     
221                         else if (namefile == "not found") { namefile = "";  }
222                         else { m->setNameFile(namefile); }
223             
224             countfile = validParameter.validFile(parameters, "count", true);
225                         if (countfile == "not open") { abort = true; countfile = "";}   
226                         else if (countfile == "not found") { countfile = "";  }
227                         else { m->setCountTableFile(countfile); }
228                         
229                         fastafile = validParameter.validFile(parameters, "fasta", true);
230                         if (fastafile == "not open") { abort = true; }  
231                         else if (fastafile == "not found") { fastafile = ""; }
232                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
233                         
234                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
235                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
236                         else if (taxFile == "not found") { taxFile = ""; }
237                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
238                         
239                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
240                                 //is there are current file available for either of these?
241                                 //give priority to column, then phylip, then fasta
242                                 columnfile = m->getColumnFile(); 
243                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
244                                 else { 
245                                         phylipfile = m->getPhylipFile(); 
246                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
247                                         else { 
248                                                 fastafile = m->getFastaFile(); 
249                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
250                                                 else { 
251                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
252                                                         abort = true; 
253                                                 }
254                                         }
255                                 }
256                         }
257                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
258             
259             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
260             
261                         if (columnfile != "") {
262                                 if ((namefile == "") && (countfile == "")) { 
263                                         namefile = m->getNameFile(); 
264                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
265                                         else { 
266                                                 countfile = m->getCountTableFile();
267                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
268                         else { 
269                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
270                             abort = true; 
271                         }       
272                                         }       
273                                 }
274                         }
275                         
276                         if (fastafile != "") {
277                                 if (taxFile == "") { 
278                                         taxFile = m->getTaxonomyFile(); 
279                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
280                                         else { 
281                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
282                                                 abort = true; 
283                                         }       
284                                 }
285                                 
286                                 if ((namefile == "") && (countfile == "")) { 
287                                         namefile = m->getNameFile(); 
288                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
289                                         else { 
290                                                 countfile = m->getCountTableFile();
291                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
292                         else { 
293                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); 
294                             abort = true; 
295                         }       
296                                         }       
297                                 }
298                         }
299                                         
300                         //check for optional parameter and set defaults
301                         // ...at some point should added some additional type checking...
302                         //get user cutoff and precision or use defaults
303                         string temp;
304                         temp = validParameter.validFile(parameters, "precision", false);
305                         if (temp == "not found") { temp = "100"; }
306                         //saves precision legnth for formatting below
307                         length = temp.length();
308                         m->mothurConvert(temp, precision); 
309                         
310                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
311                         hard = m->isTrue(temp);
312                         
313                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
314                         large = m->isTrue(temp);
315             
316                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
317                         m->setProcessors(temp);
318                         m->mothurConvert(temp, processors);
319                         
320                         temp = validParameter.validFile(parameters, "splitmethod", false);      
321                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322                                 if (temp == "not found")  { splitmethod = "distance"; }
323                                 else {  splitmethod = temp; }
324                         }
325                         
326             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
327                         classic = m->isTrue(temp);
328             
329             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
330
331                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
332                         m->mothurConvert(temp, cutoff); 
333                         cutoff += (5 / (precision * 10.0));  
334                         
335                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
336                         m->mothurConvert(temp, taxLevelCutoff); 
337                         
338                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
339                         
340                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
342                         
343                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
345                         
346                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
347
348                         showabund = validParameter.validFile(parameters, "showabund", false);
349                         if (showabund == "not found") { showabund = "T"; }
350             
351             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
352             runCluster = m->isTrue(temp);
353
354                         timing = validParameter.validFile(parameters, "timing", false);
355                         if (timing == "not found") { timing = "F"; }
356                         
357                 }
358         }
359         catch(exception& e) {
360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
361                 exit(1);
362         }
363 }
364
365 //**********************************************************************************************************************
366
367 int ClusterSplitCommand::execute(){
368         try {
369         
370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
371                 
372                 time_t estart;
373                 vector<string> listFileNames;
374                 set<string> labels;
375                 string singletonName = "";
376                 double saveCutoff = cutoff;
377
378                 //****************** file prep work ******************************//
379                 #ifdef USE_MPI
380                         int pid;
381                         int tag = 2001;
382                         MPI_Status status; 
383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
385                         
386                         if (pid == 0) { //only process 0 converts and splits
387                         
388                 #endif
389                 
390                 //if user gave a phylip file convert to column file
391                 if (format == "phylip") {
392                         estart = time(NULL);
393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
394                         
395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
396                         
397                         NameAssignment* nameMap = NULL;
398                         convert->setFormat("phylip");
399                         convert->read(nameMap);
400                         
401                         if (m->control_pressed) {  delete convert;  return 0;  }
402                         
403                         distfile = convert->getOutputFile();
404                 
405                         //if no names file given with phylip file, create it
406                         ListVector* listToMakeNameFile =  convert->getListVector();
407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
408                                 ofstream out;
409                                 namefile = phylipfile + ".names";
410                                 m->openOutputFile(namefile, out);
411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412                                         string bin = listToMakeNameFile->get(i);
413                                         out << bin << '\t' << bin << endl;
414                                 }
415                                 out.close();
416                         }
417                         delete listToMakeNameFile;
418                         delete convert;
419                         
420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
421                 }
422                 if (m->control_pressed) { return 0; }
423                 
424                 estart = time(NULL);
425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
426                 
427                 //split matrix into non-overlapping groups
428                 SplitMatrix* split;
429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
433                 
434                 split->split();
435                 
436                 if (m->control_pressed) { delete split; return 0; }
437                 
438                 singletonName = split->getSingletonNames();
439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
440                 delete split;
441                 
442                 //output a merged distance file
443                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
444                         
445                                 
446                 if (m->control_pressed) { return 0; }
447                 
448                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
449                 estart = time(NULL);
450               
451         if (!runCluster) { 
452
453                 m->mothurOutEndLine();
454                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
455                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
456                 m->mothurOutEndLine();
457                 return 0;
458                 
459         }
460    
461                 //****************** break up files between processes and cluster each file set ******************************//
462         #ifdef USE_MPI
463                         ////you are process 0 from above////
464                         
465                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
466                         dividedNames.resize(processors);
467                                         
468                         //for each file group figure out which process will complete it
469                         //want to divide the load intelligently so the big files are spread between processes
470                         for (int i = 0; i < distName.size(); i++) { 
471                                 int processToAssign = (i+1) % processors; 
472                                 if (processToAssign == 0) { processToAssign = processors; }
473                                                 
474                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
475                         }
476                                         
477                         //not lets reverse the order of ever other process, so we balance big files running with little ones
478                         for (int i = 0; i < processors; i++) {
479                                 int remainder = ((i+1) % processors);
480                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
481                         }
482                         
483                         
484                         //send each child the list of files it needs to process
485                         for(int i = 1; i < processors; i++) { 
486                                 //send number of file pairs
487                                 int num = dividedNames[i].size();
488                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
489                                 
490                                 for (int j = 0; j < num; j++) { //send filenames to process i
491                                         char tempDistFileName[1024];
492                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
493                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
494                                         
495                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
496                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
497                                         
498                                         char tempNameFileName[1024];
499                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
500                                         int lengthName = (dividedNames[i][j].begin()->second).length();
501
502                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
503                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
504                                 }
505                         }
506                         
507                         //process your share
508                         listFileNames = cluster(dividedNames[0], labels);
509                         
510                         //receive the other processes info
511                         for(int i = 1; i < processors; i++) { 
512                                 int num = dividedNames[i].size();
513                                 
514                                 double tempCutoff;
515                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
516                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
517                                 
518                                 //send list filenames to root process
519                                 for (int j = 0; j < num; j++) {  
520                                         int lengthList = 0;
521                                         char tempListFileName[1024];
522                                 
523                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
524                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
525                                 
526                                         string myListFileName = tempListFileName;
527                                         myListFileName = myListFileName.substr(0, lengthList);
528                                         
529                                         listFileNames.push_back(myListFileName);
530                                 }
531                                 
532                                 //send Labels to root process
533                                 int numLabels = 0;
534                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
535                                 
536                                 for (int j = 0; j < numLabels; j++) {  
537                                         int lengthLabel = 0;
538                                         char tempLabel[100];
539                                 
540                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
541                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
542                                 
543                                         string myLabel = tempLabel;
544                                         myLabel = myLabel.substr(0, lengthLabel);
545                         
546                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
547                                 }
548                         }
549                         
550                 }else { //you are a child process
551                         vector < map<string, string> >  myNames;
552                         
553                         //recieve the files you need to process
554                         //receive number of file pairs
555                         int num = 0;
556                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
557                         
558                         myNames.resize(num);
559         
560                         for (int j = 0; j < num; j++) { //receive filenames to process 
561                                 int lengthDist = 0;
562                                 char tempDistFileName[1024];
563                                 
564                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
565                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
566                                 
567                                 string myDistFileName = tempDistFileName;
568                                 myDistFileName = myDistFileName.substr(0, lengthDist);
569                         
570                                 int lengthName = 0;
571                                 char tempNameFileName[1024];
572                                 
573                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
574                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
575                                 
576                                 string myNameFileName = tempNameFileName;
577                                 myNameFileName = myNameFileName.substr(0, lengthName);
578                                 
579                                 //save file name
580                                 myNames[j][myDistFileName] = myNameFileName;
581                         }
582         
583                         //process them
584                         listFileNames = cluster(myNames, labels);
585                         
586                         //send cutoff
587                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
588                         
589                         //send list filenames to root process
590                         for (int j = 0; j < num; j++) {  
591                                 char tempListFileName[1024];
592                                 strcpy(tempListFileName, listFileNames[j].c_str());
593                                 int lengthList = listFileNames[j].length();
594                                         
595                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
596                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
597                         }
598                         
599                         //send Labels to root process
600                         int numLabels = labels.size();
601                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
602                         
603                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
604                                 char tempLabel[100];
605                                 strcpy(tempLabel, (*it).c_str());
606                                 int lengthLabel = (*it).length();
607                                         
608                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
609                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
610                         }
611                 }
612                 
613                 //make everyone wait
614                 MPI_Barrier(MPI_COMM_WORLD);
615                 
616         #else
617                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
618                 //sanity check
619                 if (processors > distName.size()) { processors = distName.size(); }
620                 
621                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
622                                 if(processors == 1){
623                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
624                                 }else{
625                                         listFileNames = createProcesses(distName, labels);
626                 }
627                 #else
628                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
629                 #endif
630         #endif  
631                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
632                 
633                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
634                 
635                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
636                 
637                 //****************** merge list file and create rabund and sabund files ******************************//
638                 estart = time(NULL);
639                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
640                 
641                 #ifdef USE_MPI
642                         if (pid == 0) { //only process 0 merges
643                 #endif
644
645                 ListVector* listSingle;
646                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
647                 
648                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
649                 
650                 mergeLists(listFileNames, labelBins, listSingle);
651
652                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
653                 
654                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
655                 
656                 //set list file as new current listfile
657                 string current = "";
658                 itTypes = outputTypes.find("list");
659                 if (itTypes != outputTypes.end()) {
660                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
661                 }
662                 
663                 //set rabund file as new current rabundfile
664                 itTypes = outputTypes.find("rabund");
665                 if (itTypes != outputTypes.end()) {
666                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
667                 }
668                 
669                 //set sabund file as new current sabundfile
670                 itTypes = outputTypes.find("sabund");
671                 if (itTypes != outputTypes.end()) {
672                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
673                 }
674                                 
675                 m->mothurOutEndLine();
676                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
677                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
678                 m->mothurOutEndLine();
679                 
680                 #ifdef USE_MPI
681                         } //only process 0 merges
682                         
683                         //make everyone wait
684                         MPI_Barrier(MPI_COMM_WORLD);
685                 #endif
686
687                 return 0;
688         }
689         catch(exception& e) {
690                 m->errorOut(e, "ClusterSplitCommand", "execute");
691                 exit(1);
692         }
693 }
694 //**********************************************************************************************************************
695 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
696         try {
697                                 
698                 map<float, int> labelBin;
699                 vector<float> orderFloat;
700                 int numSingleBins;
701                 
702                 //read in singletons
703                 if (singleton != "none") {
704             
705             ifstream in;
706             m->openInputFile(singleton, in);
707                                 
708                         string firstCol, secondCol;
709                         listSingle = new ListVector();
710             
711             if (countfile != "") { m->getline(in); m->gobble(in); }
712             
713                         while (!in.eof()) {
714                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
715                                 if (countfile == "") { listSingle->push_back(secondCol); }
716                 else { listSingle->push_back(firstCol); }
717                         }
718             
719                         in.close();
720                         m->mothurRemove(singleton);
721                         
722                         numSingleBins = listSingle->getNumBins();
723                 }else{  listSingle = NULL; numSingleBins = 0;  }
724                 
725                 //go through users set and make them floats so we can sort them 
726                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
727                         float temp = -10.0;
728
729                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
730                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
731                         
732                         if (temp <= cutoff) {
733                                 orderFloat.push_back(temp);
734                                 labelBin[temp] = numSingleBins; //initialize numbins 
735                         }
736                 }
737         
738                 //sort order
739                 sort(orderFloat.begin(), orderFloat.end());
740                 userLabels.clear();
741                         
742                 //get the list info from each file
743                 for (int k = 0; k < listNames.size(); k++) {
744         
745                         if (m->control_pressed) {  
746                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
747                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
748                                 return labelBin;
749                         }
750                         
751                         InputData* input = new InputData(listNames[k], "list");
752                         ListVector* list = input->getListVector();
753                         string lastLabel = list->getLabel();
754                         
755                         string filledInList = listNames[k] + "filledInTemp";
756                         ofstream outFilled;
757                         m->openOutputFile(filledInList, outFilled);
758         
759                         //for each label needed
760                         for(int l = 0; l < orderFloat.size(); l++){
761                         
762                                 string thisLabel;
763                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
764                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
765
766                                 //this file has reached the end
767                                 if (list == NULL) { 
768                                         list = input->getListVector(lastLabel, true); 
769                                 }else{  //do you have the distance, or do you need to fill in
770                                                 
771                                         float labelFloat;
772                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
773                                         else { convert(list->getLabel(), labelFloat); }
774
775                                         //check for missing labels
776                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
777                                                 //if its bigger get last label, otherwise keep it
778                                                 delete list;
779                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
780                                         }
781                                         lastLabel = list->getLabel();
782                                 }
783                                 
784                                 //print to new file
785                                 list->setLabel(thisLabel);
786                                 list->print(outFilled);
787                 
788                                 //update labelBin
789                                 labelBin[orderFloat[l]] += list->getNumBins();
790                                                                         
791                                 delete list;
792                                                                         
793                                 list = input->getListVector();
794                         }
795                         
796                         if (list != NULL) { delete list; }
797                         delete input;
798                         
799                         outFilled.close();
800                         m->mothurRemove(listNames[k]);
801                         rename(filledInList.c_str(), listNames[k].c_str());
802                 }
803                 
804                 return labelBin;
805         }
806         catch(exception& e) {
807                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
808                 exit(1);
809         }
810 }
811 //**********************************************************************************************************************
812 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
813         try {
814                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
815                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
816                 
817         map<string, string> variables; 
818         variables["[filename]"] = fileroot;
819         variables["[clustertag]"] = tag;
820         string sabundFileName = getOutputFileName("sabund", variables);
821         string rabundFileName = getOutputFileName("rabund", variables);
822         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
823         string listFileName = getOutputFileName("list", variables);
824         
825         if (countfile == "") {
826             m->openOutputFile(sabundFileName,   outSabund);
827             m->openOutputFile(rabundFileName,   outRabund);
828             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
829             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
830             
831         }
832                 m->openOutputFile(listFileName, outList);
833         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
834                 
835                 
836                 map<float, int>::iterator itLabel;
837
838                 //for each label needed
839                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
840                         
841                         string thisLabel;
842                         if (itLabel->first == -1) { thisLabel = "unique"; }
843                         else { thisLabel = toString(itLabel->first,  length-1);  } 
844                         
845                         outList << thisLabel << '\t' << itLabel->second << '\t';
846             
847             RAbundVector* rabund = NULL;
848             if (countfile == "") {
849                 rabund = new RAbundVector();
850                 rabund->setLabel(thisLabel);
851             }
852
853                         //add in singletons
854                         if (listSingle != NULL) {
855                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
856                                         outList << listSingle->get(j) << '\t';
857                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
858                                 }
859                         }
860                         
861                         //get the list info from each file
862                         for (int k = 0; k < listNames.size(); k++) {
863         
864                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
865                                 
866                                 InputData* input = new InputData(listNames[k], "list");
867                                 ListVector* list = input->getListVector(thisLabel);
868                                 
869                                 //this file has reached the end
870                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
871                                 else {          
872                                         for (int j = 0; j < list->getNumBins(); j++) {
873                                                 outList << list->get(j) << '\t';
874                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
875                                         }
876                                         delete list;
877                                 }
878                                 delete input;
879                         }
880                         
881             if (countfile == "") {
882                 SAbundVector sabund = rabund->getSAbundVector();
883                 sabund.print(outSabund);
884                 rabund->print(outRabund);
885             }
886                         outList << endl;
887                         
888                         if (rabund != NULL) { delete rabund; }
889                 }
890                 
891                 outList.close();
892         if (countfile == "") {
893             outRabund.close();
894             outSabund.close();
895                 }
896                 if (listSingle != NULL) { delete listSingle;  }
897                 
898                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
899                 
900                 return 0;
901         }
902         catch(exception& e) {
903                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
904                 exit(1);
905         }
906 }
907
908 //**********************************************************************************************************************
909
910 void ClusterSplitCommand::printData(ListVector* oldList){
911         try {
912                 string label = oldList->getLabel();
913                 RAbundVector oldRAbund = oldList->getRAbundVector();
914                 
915                 oldRAbund.setLabel(label);
916                 if (m->isTrue(showabund)) {
917                         oldRAbund.getSAbundVector().print(cout);
918                 }
919                 oldRAbund.print(outRabund);
920                 oldRAbund.getSAbundVector().print(outSabund);
921         
922                 oldList->print(outList);
923         }
924         catch(exception& e) {
925                 m->errorOut(e, "ClusterSplitCommand", "printData");
926                 exit(1);
927         }
928 }
929 //**********************************************************************************************************************
930 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
931         try {
932         
933         vector<string> listFiles;
934         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
935         dividedNames.resize(processors);
936         
937         //for each file group figure out which process will complete it
938         //want to divide the load intelligently so the big files are spread between processes
939         for (int i = 0; i < distName.size(); i++) { 
940             //cout << i << endl;
941             int processToAssign = (i+1) % processors; 
942             if (processToAssign == 0) { processToAssign = processors; }
943             
944             dividedNames[(processToAssign-1)].push_back(distName[i]);
945             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
946         }
947         
948         //not lets reverse the order of ever other process, so we balance big files running with little ones
949         for (int i = 0; i < processors; i++) {
950             //cout << i << endl;
951             int remainder = ((i+1) % processors);
952             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
953         }
954         
955         if (m->control_pressed) { return listFiles; }
956         
957         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
958                 int process = 1;
959                 processIDS.clear();
960                 
961                 //loop through and create all the processes you want
962                 while (process != processors) {
963                         int pid = fork();
964                         
965                         if (pid > 0) {
966                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
967                                 process++;
968                         }else if (pid == 0){
969                                 set<string> labels;
970                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
971                                 
972                                 //write out names to file
973                                 string filename = toString(getpid()) + ".temp";
974                                 ofstream out;
975                                 m->openOutputFile(filename, out);
976                                 out << tag << endl;
977                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
978                                 out.close();
979                                 
980                                 //print out labels
981                                 ofstream outLabels;
982                                 filename = toString(getpid()) + ".temp.labels";
983                                 m->openOutputFile(filename, outLabels);
984                                 
985                                 outLabels << cutoff << endl;
986                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
987                                         outLabels << (*it) << endl;
988                                 }
989                                 outLabels.close();
990
991                                 exit(0);
992                         }else { 
993                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
994                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
995                                 exit(0);
996                         }
997                 }
998                 
999         //do your part
1000         listFiles = cluster(dividedNames[0], labels);
1001         
1002                 //force parent to wait until all the processes are done
1003                 for (int i=0;i< processIDS.size();i++) { 
1004                         int temp = processIDS[i];
1005                         wait(&temp);
1006                 }
1007         
1008         //get list of list file names from each process
1009         for(int i=0;i<processIDS.size();i++){
1010             string filename = toString(processIDS[i]) + ".temp";
1011             ifstream in;
1012             m->openInputFile(filename, in);
1013             
1014             in >> tag; m->gobble(in);
1015             
1016             while(!in.eof()) {
1017                 string tempName;
1018                 in >> tempName; m->gobble(in);
1019                 listFiles.push_back(tempName);
1020             }
1021             in.close();
1022             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1023             
1024             //get labels
1025             filename = toString(processIDS[i]) + ".temp.labels";
1026             ifstream in2;
1027             m->openInputFile(filename, in2);
1028             
1029             float tempCutoff;
1030             in2 >> tempCutoff; m->gobble(in2);
1031             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1032             
1033             while(!in2.eof()) {
1034                 string tempName;
1035                 in2 >> tempName; m->gobble(in2);
1036                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1037             }
1038             in2.close();
1039             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1040         }
1041         
1042
1043     #else
1044        
1045         //////////////////////////////////////////////////////////////////////////////////////////////////////
1046                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
1047                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1048                 //Taking advantage of shared memory to allow both threads to add labels.
1049                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1050                 /*
1051                 vector<clusterData*> pDataArray; 
1052                 DWORD   dwThreadIdArray[processors-1];
1053                 HANDLE  hThreadArray[processors-1]; 
1054                 
1055                 //Create processor worker threads.
1056                 for( int i=1; i<processors; i++ ){
1057                         // Allocate memory for thread data.
1058                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1059                         pDataArray.push_back(tempCluster);
1060                         processIDS.push_back(i);
1061             
1062                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1063                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1064                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
1065             
1066                 }
1067         
1068         //do your part
1069         listFiles = cluster(dividedNames[0], labels);
1070         
1071                 //Wait until all threads have terminated.
1072                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1073                 
1074                 //Close all thread handles and free memory allocations.
1075                 for(int i=0; i < pDataArray.size(); i++){
1076             //get tag
1077             tag = pDataArray[i]->tag;
1078             //get listfiles created
1079             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1080             //get labels
1081             set<string>::iterator it;
1082             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1083                         //check cutoff
1084             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1085                         CloseHandle(hThreadArray[i]);
1086                         delete pDataArray[i];
1087                 }
1088 */
1089         #endif          
1090         
1091         return listFiles;
1092         
1093         }
1094         catch(exception& e) {
1095                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1096                 exit(1);
1097         }
1098 }
1099 //**********************************************************************************************************************
1100
1101 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1102         try {
1103                 
1104                 vector<string> listFileNames;
1105                 double smallestCutoff = cutoff;
1106                 
1107                 //cluster each distance file
1108                 for (int i = 0; i < distNames.size(); i++) {
1109             
1110                         string thisNamefile = distNames[i].begin()->second;
1111                         string thisDistFile = distNames[i].begin()->first;
1112                         
1113                         string listFileName = "";
1114             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1115             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1116
1117                         if (m->control_pressed) { //clean up
1118                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1119                                 listFileNames.clear(); return listFileNames;
1120                         }
1121             
1122             listFileNames.push_back(listFileName);
1123         }
1124                 
1125                 cutoff = smallestCutoff;
1126                                         
1127                 return listFileNames;
1128         
1129         }
1130         catch(exception& e) {
1131                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1132                 exit(1);
1133         }
1134
1135
1136 }
1137 //**********************************************************************************************************************
1138 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1139         try {
1140         string listFileName = "";
1141         
1142         ListVector* list = NULL;
1143         ListVector oldList;
1144         RAbundVector* rabund = NULL;
1145         
1146 #ifdef USE_MPI
1147         int pid;
1148         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1149         
1150         //output your files too
1151         if (pid != 0) {
1152             cout << endl << "Reading " << thisDistFile << endl;
1153         }
1154 #endif
1155
1156         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1157         
1158         //reads phylip file storing data in 2D vector, also fills list and rabund
1159         bool sim = false;
1160                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1161         
1162         NameAssignment* nameMap = NULL;
1163         CountTable* ct = NULL;
1164         if(namefile != ""){     
1165                         nameMap = new NameAssignment(thisNamefile);
1166                         nameMap->readMap();
1167             cluster->readPhylipFile(thisDistFile, nameMap);
1168                 }else if (countfile != "") {
1169             ct = new CountTable();
1170             ct->readTable(thisNamefile);
1171             cluster->readPhylipFile(thisDistFile, ct);
1172         }
1173         tag = cluster->getTag();
1174         
1175                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1176             else { delete ct; } delete cluster; return 0; }
1177                 
1178                 list = cluster->getListVector();
1179                 rabund = cluster->getRAbundVector();
1180         
1181                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1182                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1183         listFileName = fileroot+ tag + ".list";
1184         
1185         ofstream listFile;
1186                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1187                 
1188                 float previousDist = 0.00000;
1189                 float rndPreviousDist = 0.00000;
1190                 oldList = *list;
1191                 
1192 #ifdef USE_MPI
1193         //output your files too
1194         if (pid != 0) {
1195             cout << endl << "Clustering " << thisDistFile << endl;
1196         }
1197 #endif
1198
1199         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1200         
1201                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1202                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1203                 else { delete ct; } return listFileName;  }
1204             
1205                         cluster->update(cutoff);
1206             
1207                         float dist = cluster->getSmallDist();
1208                         float rndDist;
1209                         if (hard) {
1210                                 rndDist = m->ceilDist(dist, precision); 
1211                         }else{
1212                                 rndDist = m->roundDist(dist, precision); 
1213                         }
1214             
1215             if(previousDist <= 0.0000 && dist != previousDist){
1216                 oldList.setLabel("unique");
1217                 oldList.print(listFile);
1218                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1219             }
1220             else if(rndDist != rndPreviousDist){
1221                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1222                 oldList.print(listFile);
1223                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1224             }
1225
1226             
1227                         previousDist = dist;
1228                         rndPreviousDist = rndDist;
1229                         oldList = *list;
1230                 }
1231         
1232                 if(previousDist <= 0.0000){
1233             oldList.setLabel("unique");
1234             oldList.print(listFile);
1235             if (labels.count("unique") == 0) { labels.insert("unique"); }
1236         }
1237         else if(rndPreviousDist<cutoff){
1238             oldList.setLabel(toString(rndPreviousDist,  length-1));
1239             oldList.print(listFile);
1240             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1241         }
1242
1243         
1244                 listFile.close();
1245                 
1246                 delete cluster;  delete list; delete rabund;
1247         if(namefile != ""){     delete nameMap; }
1248         else { delete ct; }
1249         
1250         m->mothurRemove(thisDistFile);
1251         m->mothurRemove(thisNamefile);
1252         
1253         return listFileName;
1254         
1255         }
1256         catch(exception& e) {
1257                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1258                 exit(1);
1259         }
1260 }
1261
1262 //**********************************************************************************************************************
1263 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1264         try {
1265         string listFileName = "";
1266         
1267         Cluster* cluster = NULL;
1268         SparseDistanceMatrix* matrix = NULL;
1269         ListVector* list = NULL;
1270         ListVector oldList;
1271         RAbundVector* rabund = NULL;
1272         
1273         if (m->control_pressed) { return listFileName; }
1274         
1275 #ifdef USE_MPI
1276         int pid;
1277         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1278         
1279         //output your files too
1280         if (pid != 0) {
1281             cout << endl << "Reading " << thisDistFile << endl;
1282         }
1283 #endif
1284         
1285         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1286         
1287         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1288         read->setCutoff(cutoff);
1289         
1290         NameAssignment* nameMap = NULL;
1291         CountTable* ct = NULL;
1292                 if(namefile != ""){     
1293                         nameMap = new NameAssignment(thisNamefile);
1294                         nameMap->readMap();
1295             read->read(nameMap);
1296                 }else if (countfile != "") {
1297             ct = new CountTable();
1298             ct->readTable(thisNamefile);
1299             read->read(ct);
1300         }else { read->read(nameMap); }
1301                 
1302                 list = read->getListVector();
1303         oldList = *list;
1304                 matrix = read->getDMatrix();
1305         
1306                 if(countfile != "") {
1307             rabund = new RAbundVector();
1308             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1309             delete ct;
1310         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1311
1312         delete read;  read = NULL;
1313         if (namefile != "") { delete nameMap; nameMap = NULL; }
1314         
1315         
1316 #ifdef USE_MPI
1317         //output your files too
1318         if (pid != 0) {
1319             cout << endl << "Clustering " << thisDistFile << endl;
1320         }
1321 #endif
1322         
1323         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1324                 
1325         //create cluster
1326         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1327         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1328         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1329         tag = cluster->getTag();
1330                 
1331         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1332         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1333         
1334         ofstream listFile;
1335         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1336                 
1337         listFileName = fileroot+ tag + ".list";
1338         
1339         float previousDist = 0.00000;
1340         float rndPreviousDist = 0.00000;
1341         
1342         oldList = *list;
1343         
1344         print_start = true;
1345         start = time(NULL);
1346         double saveCutoff = cutoff;
1347                 
1348         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1349             
1350             if (m->control_pressed) { //clean up
1351                 delete matrix; delete list;     delete cluster; delete rabund;
1352                 listFile.close();
1353                 m->mothurRemove(listFileName);  
1354                 return listFileName;
1355             }
1356             
1357             cluster->update(saveCutoff);
1358             
1359             float dist = matrix->getSmallDist();
1360             float rndDist;
1361             if (hard) {
1362                 rndDist = m->ceilDist(dist, precision); 
1363             }else{
1364                 rndDist = m->roundDist(dist, precision); 
1365             }
1366             
1367             if(previousDist <= 0.0000 && dist != previousDist){
1368                 oldList.setLabel("unique");
1369                 oldList.print(listFile);
1370                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1371             }
1372             else if(rndDist != rndPreviousDist){
1373                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1374                 oldList.print(listFile);
1375                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1376             }
1377             
1378             previousDist = dist;
1379             rndPreviousDist = rndDist;
1380             oldList = *list;
1381         }
1382         
1383                 
1384         if(previousDist <= 0.0000){
1385             oldList.setLabel("unique");
1386             oldList.print(listFile);
1387             if (labels.count("unique") == 0) { labels.insert("unique"); }
1388         }
1389         else if(rndPreviousDist<cutoff){
1390             oldList.setLabel(toString(rndPreviousDist,  length-1));
1391             oldList.print(listFile);
1392             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1393         }
1394         
1395         delete matrix; delete list;     delete cluster; delete rabund; 
1396         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1397         listFile.close();
1398         
1399         if (m->control_pressed) { //clean up
1400             m->mothurRemove(listFileName);      
1401             return listFileName;
1402         }
1403         
1404         m->mothurRemove(thisDistFile);
1405         m->mothurRemove(thisNamefile);
1406         
1407         if (saveCutoff != cutoff) { 
1408             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1409             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1410                         
1411             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1412         }
1413         
1414         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1415         
1416         return listFileName;
1417         
1418         }
1419         catch(exception& e) {
1420                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1421                 exit(1);
1422         }
1423 }
1424 //**********************************************************************************************************************
1425
1426 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1427         try{
1428                 
1429 #ifdef USE_MPI
1430                 int pid;
1431                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1432                 
1433                 if (pid != 0) {
1434 #endif
1435                 
1436                 string thisOutputDir = outputDir;
1437                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1438         map<string, string> variables; 
1439         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1440                 string outputFileName = getOutputFileName("column", variables);
1441                 m->mothurRemove(outputFileName);
1442                 
1443                 
1444                 for (int i = 0; i < distNames.size(); i++) {
1445                         if (m->control_pressed) {  return 0; }
1446                         
1447                         string thisDistFile = distNames[i].begin()->first;
1448                         
1449                         m->appendFiles(thisDistFile, outputFileName);
1450                 }       
1451                         
1452                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1453                         
1454 #ifdef USE_MPI
1455                 }
1456 #endif
1457                                 
1458                 return 0;       
1459                 
1460                 
1461         }
1462         catch(exception& e) {
1463                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1464                 exit(1);
1465         }
1466 }
1467 //**********************************************************************************************************************
1468 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1469     try {
1470         rabund->setLabel(list->getLabel());        
1471         for(int i = 0; i < list->getNumBins(); i++) { 
1472             if (m->control_pressed) { break; }
1473             vector<string> binNames;
1474             string bin = list->get(i);
1475             m->splitAtComma(bin, binNames);
1476             int total = 0;
1477             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1478             rabund->push_back(total);   
1479         }
1480         return 0;
1481     }
1482     catch(exception& e) {
1483                 m->errorOut(e, "ClusterCommand", "createRabund");
1484                 exit(1);
1485         }
1486     
1487 }
1488 //**********************************************************************************************************************