]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
changed added group output to indicator command. a few changes to work with the guy
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
26         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcluster);
27                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
28                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
30                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
31                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
32                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
33         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
34                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
35                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
36                         
37                 vector<string> myArray;
38                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
39                 return myArray;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
43                 exit(1);
44         }
45 }
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){    
48         try {
49                 string helpString = "";
50                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
56                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59                 helpString += "The name parameter allows you to enter your name file. \n";
60         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
69         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
70 #ifdef USE_MPI
71                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
72 #endif
73                 helpString += "The cluster.split command should be in the following format: \n";
74                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
76                 return helpString;
77         }
78         catch(exception& e) {
79                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
80                 exit(1);
81         }
82 }
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){     
85         try {
86         string outputFileName = "";
87                 map<string, vector<string> >::iterator it;
88         
89         //is this a type this command creates
90         it = outputTypes.find(type);
91         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
92         else {
93             if (type == "list") {  outputFileName =  "list"; }
94             else if (type == "rabund") {  outputFileName =  "rabund"; }
95             else if (type == "sabund") {  outputFileName =  "sabund"; }
96             else if (type == "column") {  outputFileName =  "dist"; }
97             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
98         }
99         return outputFileName;
100         }
101         catch(exception& e) {
102                 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
103                 exit(1);
104         }
105 }
106 //**********************************************************************************************************************
107 ClusterSplitCommand::ClusterSplitCommand(){     
108         try {
109                 abort = true; calledHelp = true; 
110                 setParameters();
111                 vector<string> tempOutNames;
112                 outputTypes["list"] = tempOutNames;
113                 outputTypes["rabund"] = tempOutNames;
114                 outputTypes["sabund"] = tempOutNames;
115                 outputTypes["column"] = tempOutNames;
116         }
117         catch(exception& e) {
118                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
119                 exit(1);
120         }
121 }
122 //**********************************************************************************************************************
123 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
124 ClusterSplitCommand::ClusterSplitCommand(string option)  {
125         try{
126                 abort = false; calledHelp = false;   
127                 format = "";
128                 
129                 //allow user to run help
130                 if(option == "help") { help(); abort = true; calledHelp = true; }
131                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
132                 
133                 else {
134                         vector<string> myArray = setParameters();
135                         
136                         OptionParser parser(option);
137                         map<string,string> parameters = parser.getParameters();
138                         
139                         ValidParameters validParameter("cluster.split");
140                 
141                         //check to make sure all parameters are valid for command
142                         map<string,string>::iterator it;
143                         for (it = parameters.begin(); it != parameters.end(); it++) { 
144                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
145                                         abort = true;
146                                 }
147                         }
148                         
149                         //initialize outputTypes
150                         vector<string> tempOutNames;
151                         outputTypes["list"] = tempOutNames;
152                         outputTypes["rabund"] = tempOutNames;
153                         outputTypes["sabund"] = tempOutNames;
154                         outputTypes["column"] = tempOutNames;
155                         
156                         //if the user changes the output directory command factory will send this info to us in the output parameter 
157                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
158                         
159                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
160                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
161                         if (inputDir == "not found"){   inputDir = "";          }
162                         else {
163                                 string path;
164                                 it = parameters.find("phylip");
165                                 //user has given a template file
166                                 if(it != parameters.end()){ 
167                                         path = m->hasPath(it->second);
168                                         //if the user has not given a path then, add inputdir. else leave path alone.
169                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
170                                 }
171                                 
172                                 it = parameters.find("column");
173                                 //user has given a template file
174                                 if(it != parameters.end()){ 
175                                         path = m->hasPath(it->second);
176                                         //if the user has not given a path then, add inputdir. else leave path alone.
177                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
178                                 }
179                                 
180                                 it = parameters.find("name");
181                                 //user has given a template file
182                                 if(it != parameters.end()){ 
183                                         path = m->hasPath(it->second);
184                                         //if the user has not given a path then, add inputdir. else leave path alone.
185                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
186                                 }
187                                 
188                                 it = parameters.find("taxonomy");
189                                 //user has given a template file
190                                 if(it != parameters.end()){ 
191                                         path = m->hasPath(it->second);
192                                         //if the user has not given a path then, add inputdir. else leave path alone.
193                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
194                                 }
195                                 
196                                 it = parameters.find("fasta");
197                                 //user has given a template file
198                                 if(it != parameters.end()){ 
199                                         path = m->hasPath(it->second);
200                                         //if the user has not given a path then, add inputdir. else leave path alone.
201                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
202                                 }
203                 
204                 it = parameters.find("count");
205                                 //user has given a template file
206                                 if(it != parameters.end()){ 
207                                         path = m->hasPath(it->second);
208                                         //if the user has not given a path then, add inputdir. else leave path alone.
209                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
210                                 }
211                         }
212                         
213                         //check for required parameters
214                         phylipfile = validParameter.validFile(parameters, "phylip", true);
215                         if (phylipfile == "not open") { abort = true; }
216                         else if (phylipfile == "not found") { phylipfile = ""; }        
217                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
218                         
219                         columnfile = validParameter.validFile(parameters, "column", true);
220                         if (columnfile == "not open") { abort = true; } 
221                         else if (columnfile == "not found") { columnfile = ""; }
222                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
223                         
224                         namefile = validParameter.validFile(parameters, "name", true);
225                         if (namefile == "not open") { abort = true; namefile = "";}     
226                         else if (namefile == "not found") { namefile = "";  }
227                         else { m->setNameFile(namefile); }
228             
229             countfile = validParameter.validFile(parameters, "count", true);
230                         if (countfile == "not open") { abort = true; countfile = "";}   
231                         else if (countfile == "not found") { countfile = "";  }
232                         else { m->setCountTableFile(countfile); }
233                         
234                         fastafile = validParameter.validFile(parameters, "fasta", true);
235                         if (fastafile == "not open") { abort = true; }  
236                         else if (fastafile == "not found") { fastafile = ""; }
237                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
238                         
239                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
240                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
241                         else if (taxFile == "not found") { taxFile = ""; }
242                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
243                         
244                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
245                                 //is there are current file available for either of these?
246                                 //give priority to column, then phylip, then fasta
247                                 columnfile = m->getColumnFile(); 
248                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
249                                 else { 
250                                         phylipfile = m->getPhylipFile(); 
251                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
252                                         else { 
253                                                 fastafile = m->getFastaFile(); 
254                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
255                                                 else { 
256                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
257                                                         abort = true; 
258                                                 }
259                                         }
260                                 }
261                         }
262                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
263             
264             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
265             
266                         if (columnfile != "") {
267                                 if ((namefile == "") && (countfile == "")) { 
268                                         namefile = m->getNameFile(); 
269                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
270                                         else { 
271                                                 countfile = m->getCountTableFile();
272                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
273                         else { 
274                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
275                             abort = true; 
276                         }       
277                                         }       
278                                 }
279                         }
280                         
281                         if (fastafile != "") {
282                                 if (taxFile == "") { 
283                                         taxFile = m->getTaxonomyFile(); 
284                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
285                                         else { 
286                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
287                                                 abort = true; 
288                                         }       
289                                 }
290                                 
291                                 if ((namefile == "") && (countfile == "")) { 
292                                         namefile = m->getNameFile(); 
293                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
294                                         else { 
295                                                 countfile = m->getCountTableFile();
296                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
297                         else { 
298                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); 
299                             abort = true; 
300                         }       
301                                         }       
302                                 }
303                         }
304                                         
305                         //check for optional parameter and set defaults
306                         // ...at some point should added some additional type checking...
307                         //get user cutoff and precision or use defaults
308                         string temp;
309                         temp = validParameter.validFile(parameters, "precision", false);
310                         if (temp == "not found") { temp = "100"; }
311                         //saves precision legnth for formatting below
312                         length = temp.length();
313                         m->mothurConvert(temp, precision); 
314                         
315                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
316                         hard = m->isTrue(temp);
317                         
318                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
319                         large = m->isTrue(temp);
320             
321                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
322                         m->setProcessors(temp);
323                         m->mothurConvert(temp, processors);
324                         
325                         temp = validParameter.validFile(parameters, "splitmethod", false);      
326                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
327                                 if (temp == "not found")  { splitmethod = "distance"; }
328                                 else {  splitmethod = temp; }
329                         }
330                         
331             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
332                         classic = m->isTrue(temp);
333             
334             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
335
336                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
337                         m->mothurConvert(temp, cutoff); 
338                         cutoff += (5 / (precision * 10.0));  
339                         
340                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
341                         m->mothurConvert(temp, taxLevelCutoff); 
342                         
343                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
344                         
345                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
346                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
347                         
348                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
349                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
350                         
351                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
352
353                         showabund = validParameter.validFile(parameters, "showabund", false);
354                         if (showabund == "not found") { showabund = "T"; }
355             
356             temp = validParameter.validFile(parameters, "cluster", false);  if (temp == "not found") { temp = "T"; }
357             runCluster = m->isTrue(temp);
358
359                         timing = validParameter.validFile(parameters, "timing", false);
360                         if (timing == "not found") { timing = "F"; }
361                         
362                 }
363         }
364         catch(exception& e) {
365                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
366                 exit(1);
367         }
368 }
369
370 //**********************************************************************************************************************
371
372 int ClusterSplitCommand::execute(){
373         try {
374         
375                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
376                 
377                 time_t estart;
378                 vector<string> listFileNames;
379                 set<string> labels;
380                 string singletonName = "";
381                 double saveCutoff = cutoff;
382
383                 //****************** file prep work ******************************//
384                 #ifdef USE_MPI
385                         int pid;
386                         int tag = 2001;
387                         MPI_Status status; 
388                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
389                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
390                         
391                         if (pid == 0) { //only process 0 converts and splits
392                         
393                 #endif
394                 
395                 //if user gave a phylip file convert to column file
396                 if (format == "phylip") {
397                         estart = time(NULL);
398                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
399                         
400                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
401                         
402                         NameAssignment* nameMap = NULL;
403                         convert->setFormat("phylip");
404                         convert->read(nameMap);
405                         
406                         if (m->control_pressed) {  delete convert;  return 0;  }
407                         
408                         distfile = convert->getOutputFile();
409                 
410                         //if no names file given with phylip file, create it
411                         ListVector* listToMakeNameFile =  convert->getListVector();
412                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
413                                 ofstream out;
414                                 namefile = phylipfile + ".names";
415                                 m->openOutputFile(namefile, out);
416                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
417                                         string bin = listToMakeNameFile->get(i);
418                                         out << bin << '\t' << bin << endl;
419                                 }
420                                 out.close();
421                         }
422                         delete listToMakeNameFile;
423                         delete convert;
424                         
425                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
426                 }
427                 if (m->control_pressed) { return 0; }
428                 
429                 estart = time(NULL);
430                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
431                 
432                 //split matrix into non-overlapping groups
433                 SplitMatrix* split;
434                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
435                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
436                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
437                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
438                 
439                 split->split();
440                 
441                 if (m->control_pressed) { delete split; return 0; }
442                 
443                 singletonName = split->getSingletonNames();
444                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
445                 delete split;
446                 
447                 //output a merged distance file
448                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
449                         
450                                 
451                 if (m->control_pressed) { return 0; }
452                 
453                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
454                 estart = time(NULL);
455                 
456         if (!runCluster) { 
457 #ifdef USE_MPI
458     }
459 #endif  
460                 m->mothurOutEndLine();
461                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
462                 for (int i = 0; i < distName.size(); i++) {     m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();      }
463                 m->mothurOutEndLine();
464                 return 0;
465                 
466         }
467                 
468                 //****************** break up files between processes and cluster each file set ******************************//
469         #ifdef USE_MPI
470                         ////you are process 0 from above////
471                         
472                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
473                         dividedNames.resize(processors);
474                                         
475                         //for each file group figure out which process will complete it
476                         //want to divide the load intelligently so the big files are spread between processes
477                         for (int i = 0; i < distName.size(); i++) { 
478                                 int processToAssign = (i+1) % processors; 
479                                 if (processToAssign == 0) { processToAssign = processors; }
480                                                 
481                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
482                         }
483                                         
484                         //not lets reverse the order of ever other process, so we balance big files running with little ones
485                         for (int i = 0; i < processors; i++) {
486                                 int remainder = ((i+1) % processors);
487                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
488                         }
489                         
490                         
491                         //send each child the list of files it needs to process
492                         for(int i = 1; i < processors; i++) { 
493                                 //send number of file pairs
494                                 int num = dividedNames[i].size();
495                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
496                                 
497                                 for (int j = 0; j < num; j++) { //send filenames to process i
498                                         char tempDistFileName[1024];
499                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
500                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
501                                         
502                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
503                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
504                                         
505                                         char tempNameFileName[1024];
506                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
507                                         int lengthName = (dividedNames[i][j].begin()->second).length();
508
509                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
510                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
511                                 }
512                         }
513                         
514                         //process your share
515                         listFileNames = cluster(dividedNames[0], labels);
516                         
517                         //receive the other processes info
518                         for(int i = 1; i < processors; i++) { 
519                                 int num = dividedNames[i].size();
520                                 
521                                 double tempCutoff;
522                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
523                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
524                                 
525                                 //send list filenames to root process
526                                 for (int j = 0; j < num; j++) {  
527                                         int lengthList = 0;
528                                         char tempListFileName[1024];
529                                 
530                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
531                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
532                                 
533                                         string myListFileName = tempListFileName;
534                                         myListFileName = myListFileName.substr(0, lengthList);
535                                         
536                                         listFileNames.push_back(myListFileName);
537                                 }
538                                 
539                                 //send Labels to root process
540                                 int numLabels = 0;
541                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
542                                 
543                                 for (int j = 0; j < numLabels; j++) {  
544                                         int lengthLabel = 0;
545                                         char tempLabel[100];
546                                 
547                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
548                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
549                                 
550                                         string myLabel = tempLabel;
551                                         myLabel = myLabel.substr(0, lengthLabel);
552                         
553                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
554                                 }
555                         }
556                         
557                 }else { //you are a child process
558                         vector < map<string, string> >  myNames;
559                         
560                         //recieve the files you need to process
561                         //receive number of file pairs
562                         int num = 0;
563                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
564                         
565                         myNames.resize(num);
566         
567                         for (int j = 0; j < num; j++) { //receive filenames to process 
568                                 int lengthDist = 0;
569                                 char tempDistFileName[1024];
570                                 
571                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
572                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
573                                 
574                                 string myDistFileName = tempDistFileName;
575                                 myDistFileName = myDistFileName.substr(0, lengthDist);
576                         
577                                 int lengthName = 0;
578                                 char tempNameFileName[1024];
579                                 
580                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
581                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
582                                 
583                                 string myNameFileName = tempNameFileName;
584                                 myNameFileName = myNameFileName.substr(0, lengthName);
585                                 
586                                 //save file name
587                                 myNames[j][myDistFileName] = myNameFileName;
588                         }
589         
590                         //process them
591                         listFileNames = cluster(myNames, labels);
592                         
593                         //send cutoff
594                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
595                         
596                         //send list filenames to root process
597                         for (int j = 0; j < num; j++) {  
598                                 char tempListFileName[1024];
599                                 strcpy(tempListFileName, listFileNames[j].c_str());
600                                 int lengthList = listFileNames[j].length();
601                                         
602                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
603                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
604                         }
605                         
606                         //send Labels to root process
607                         int numLabels = labels.size();
608                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
609                         
610                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
611                                 char tempLabel[100];
612                                 strcpy(tempLabel, (*it).c_str());
613                                 int lengthLabel = (*it).length();
614                                         
615                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
616                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
617                         }
618                 }
619                 
620                 //make everyone wait
621                 MPI_Barrier(MPI_COMM_WORLD);
622                 
623         #else
624                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
625                 //sanity check
626                 if (processors > distName.size()) { processors = distName.size(); }
627                 
628                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
629                                 if(processors == 1){
630                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
631                                 }else{
632                                         listFileNames = createProcesses(distName, labels);
633                 }
634                 #else
635                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
636                 #endif
637         #endif  
638                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
639                 
640                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
641                 
642                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
643                 
644                 //****************** merge list file and create rabund and sabund files ******************************//
645                 estart = time(NULL);
646                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
647                 
648                 #ifdef USE_MPI
649                         if (pid == 0) { //only process 0 merges
650                 #endif
651
652                 ListVector* listSingle;
653                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
654                 
655                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
656                 
657                 mergeLists(listFileNames, labelBins, listSingle);
658
659                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
660                 
661                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
662                 
663                 //set list file as new current listfile
664                 string current = "";
665                 itTypes = outputTypes.find("list");
666                 if (itTypes != outputTypes.end()) {
667                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
668                 }
669                 
670                 //set rabund file as new current rabundfile
671                 itTypes = outputTypes.find("rabund");
672                 if (itTypes != outputTypes.end()) {
673                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
674                 }
675                 
676                 //set sabund file as new current sabundfile
677                 itTypes = outputTypes.find("sabund");
678                 if (itTypes != outputTypes.end()) {
679                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
680                 }
681                                 
682                 m->mothurOutEndLine();
683                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
684                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
685                 m->mothurOutEndLine();
686                 
687                 #ifdef USE_MPI
688                         } //only process 0 merges
689                         
690                         //make everyone wait
691                         MPI_Barrier(MPI_COMM_WORLD);
692                 #endif
693
694                 return 0;
695         }
696         catch(exception& e) {
697                 m->errorOut(e, "ClusterSplitCommand", "execute");
698                 exit(1);
699         }
700 }
701 //**********************************************************************************************************************
702 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
703         try {
704                                 
705                 map<float, int> labelBin;
706                 vector<float> orderFloat;
707                 int numSingleBins;
708                 
709                 //read in singletons
710                 if (singleton != "none") {
711             
712             ifstream in;
713             m->openInputFile(singleton, in);
714                                 
715                         string firstCol, secondCol;
716                         listSingle = new ListVector();
717             
718             if (countfile != "") { m->getline(in); m->gobble(in); }
719             
720                         while (!in.eof()) {
721                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
722                                 if (countfile == "") { listSingle->push_back(secondCol); }
723                 else { listSingle->push_back(firstCol); }
724                         }
725             
726                         in.close();
727                         m->mothurRemove(singleton);
728                         
729                         numSingleBins = listSingle->getNumBins();
730                 }else{  listSingle = NULL; numSingleBins = 0;  }
731                 
732                 //go through users set and make them floats so we can sort them 
733                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
734                         float temp = -10.0;
735
736                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
737                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
738                         
739                         if (temp <= cutoff) {
740                                 orderFloat.push_back(temp);
741                                 labelBin[temp] = numSingleBins; //initialize numbins 
742                         }
743                 }
744         
745                 //sort order
746                 sort(orderFloat.begin(), orderFloat.end());
747                 userLabels.clear();
748                         
749                 //get the list info from each file
750                 for (int k = 0; k < listNames.size(); k++) {
751         
752                         if (m->control_pressed) {  
753                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
754                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
755                                 return labelBin;
756                         }
757                         
758                         InputData* input = new InputData(listNames[k], "list");
759                         ListVector* list = input->getListVector();
760                         string lastLabel = list->getLabel();
761                         
762                         string filledInList = listNames[k] + "filledInTemp";
763                         ofstream outFilled;
764                         m->openOutputFile(filledInList, outFilled);
765         
766                         //for each label needed
767                         for(int l = 0; l < orderFloat.size(); l++){
768                         
769                                 string thisLabel;
770                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
771                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
772
773                                 //this file has reached the end
774                                 if (list == NULL) { 
775                                         list = input->getListVector(lastLabel, true); 
776                                 }else{  //do you have the distance, or do you need to fill in
777                                                 
778                                         float labelFloat;
779                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
780                                         else { convert(list->getLabel(), labelFloat); }
781
782                                         //check for missing labels
783                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
784                                                 //if its bigger get last label, otherwise keep it
785                                                 delete list;
786                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
787                                         }
788                                         lastLabel = list->getLabel();
789                                 }
790                                 
791                                 //print to new file
792                                 list->setLabel(thisLabel);
793                                 list->print(outFilled);
794                 
795                                 //update labelBin
796                                 labelBin[orderFloat[l]] += list->getNumBins();
797                                                                         
798                                 delete list;
799                                                                         
800                                 list = input->getListVector();
801                         }
802                         
803                         if (list != NULL) { delete list; }
804                         delete input;
805                         
806                         outFilled.close();
807                         m->mothurRemove(listNames[k]);
808                         rename(filledInList.c_str(), listNames[k].c_str());
809                 }
810                 
811                 return labelBin;
812         }
813         catch(exception& e) {
814                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
815                 exit(1);
816         }
817 }
818 //**********************************************************************************************************************
819 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
820         try {
821                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
822                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
823                 
824         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
825         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
826         string listFileName = fileroot+ tag + ".";
827         if (countfile != "") { listFileName += "unique_"; }
828         listFileName += getOutputFileNameTag("list");
829         
830         if (countfile == "") {
831             m->openOutputFile(sabundFileName,   outSabund);
832             m->openOutputFile(rabundFileName,   outRabund);
833             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
834             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
835             
836         }
837                 m->openOutputFile(listFileName, outList);
838         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
839                 
840                 
841                 map<float, int>::iterator itLabel;
842
843                 //for each label needed
844                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
845                         
846                         string thisLabel;
847                         if (itLabel->first == -1) { thisLabel = "unique"; }
848                         else { thisLabel = toString(itLabel->first,  length-1);  } 
849                         
850                         outList << thisLabel << '\t' << itLabel->second << '\t';
851             
852             RAbundVector* rabund = NULL;
853             if (countfile == "") {
854                 rabund = new RAbundVector();
855                 rabund->setLabel(thisLabel);
856             }
857
858                         //add in singletons
859                         if (listSingle != NULL) {
860                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
861                                         outList << listSingle->get(j) << '\t';
862                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
863                                 }
864                         }
865                         
866                         //get the list info from each file
867                         for (int k = 0; k < listNames.size(); k++) {
868         
869                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
870                                 
871                                 InputData* input = new InputData(listNames[k], "list");
872                                 ListVector* list = input->getListVector(thisLabel);
873                                 
874                                 //this file has reached the end
875                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
876                                 else {          
877                                         for (int j = 0; j < list->getNumBins(); j++) {
878                                                 outList << list->get(j) << '\t';
879                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
880                                         }
881                                         delete list;
882                                 }
883                                 delete input;
884                         }
885                         
886             if (countfile == "") {
887                 SAbundVector sabund = rabund->getSAbundVector();
888                 sabund.print(outSabund);
889                 rabund->print(outRabund);
890             }
891                         outList << endl;
892                         
893                         if (rabund != NULL) { delete rabund; }
894                 }
895                 
896                 outList.close();
897         if (countfile == "") {
898             outRabund.close();
899             outSabund.close();
900                 }
901                 if (listSingle != NULL) { delete listSingle;  }
902                 
903                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
904                 
905                 return 0;
906         }
907         catch(exception& e) {
908                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
909                 exit(1);
910         }
911 }
912
913 //**********************************************************************************************************************
914
915 void ClusterSplitCommand::printData(ListVector* oldList){
916         try {
917                 string label = oldList->getLabel();
918                 RAbundVector oldRAbund = oldList->getRAbundVector();
919                 
920                 oldRAbund.setLabel(label);
921                 if (m->isTrue(showabund)) {
922                         oldRAbund.getSAbundVector().print(cout);
923                 }
924                 oldRAbund.print(outRabund);
925                 oldRAbund.getSAbundVector().print(outSabund);
926         
927                 oldList->print(outList);
928         }
929         catch(exception& e) {
930                 m->errorOut(e, "ClusterSplitCommand", "printData");
931                 exit(1);
932         }
933 }
934 //**********************************************************************************************************************
935 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
936         try {
937         
938         vector<string> listFiles;
939         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
940         dividedNames.resize(processors);
941         
942         //for each file group figure out which process will complete it
943         //want to divide the load intelligently so the big files are spread between processes
944         for (int i = 0; i < distName.size(); i++) { 
945             //cout << i << endl;
946             int processToAssign = (i+1) % processors; 
947             if (processToAssign == 0) { processToAssign = processors; }
948             
949             dividedNames[(processToAssign-1)].push_back(distName[i]);
950             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
951         }
952         
953         //not lets reverse the order of ever other process, so we balance big files running with little ones
954         for (int i = 0; i < processors; i++) {
955             //cout << i << endl;
956             int remainder = ((i+1) % processors);
957             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
958         }
959         
960         if (m->control_pressed) { return listFiles; }
961         
962         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
963                 int process = 1;
964                 processIDS.clear();
965                 
966                 //loop through and create all the processes you want
967                 while (process != processors) {
968                         int pid = fork();
969                         
970                         if (pid > 0) {
971                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
972                                 process++;
973                         }else if (pid == 0){
974                                 set<string> labels;
975                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
976                                 
977                                 //write out names to file
978                                 string filename = toString(getpid()) + ".temp";
979                                 ofstream out;
980                                 m->openOutputFile(filename, out);
981                                 out << tag << endl;
982                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
983                                 out.close();
984                                 
985                                 //print out labels
986                                 ofstream outLabels;
987                                 filename = toString(getpid()) + ".temp.labels";
988                                 m->openOutputFile(filename, outLabels);
989                                 
990                                 outLabels << cutoff << endl;
991                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
992                                         outLabels << (*it) << endl;
993                                 }
994                                 outLabels.close();
995
996                                 exit(0);
997                         }else { 
998                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
999                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1000                                 exit(0);
1001                         }
1002                 }
1003                 
1004         //do your part
1005         listFiles = cluster(dividedNames[0], labels);
1006         
1007                 //force parent to wait until all the processes are done
1008                 for (int i=0;i< processIDS.size();i++) { 
1009                         int temp = processIDS[i];
1010                         wait(&temp);
1011                 }
1012         
1013         //get list of list file names from each process
1014         for(int i=0;i<processIDS.size();i++){
1015             string filename = toString(processIDS[i]) + ".temp";
1016             ifstream in;
1017             m->openInputFile(filename, in);
1018             
1019             in >> tag; m->gobble(in);
1020             
1021             while(!in.eof()) {
1022                 string tempName;
1023                 in >> tempName; m->gobble(in);
1024                 listFiles.push_back(tempName);
1025             }
1026             in.close();
1027             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1028             
1029             //get labels
1030             filename = toString(processIDS[i]) + ".temp.labels";
1031             ifstream in2;
1032             m->openInputFile(filename, in2);
1033             
1034             float tempCutoff;
1035             in2 >> tempCutoff; m->gobble(in2);
1036             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1037             
1038             while(!in2.eof()) {
1039                 string tempName;
1040                 in2 >> tempName; m->gobble(in2);
1041                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1042             }
1043             in2.close();
1044             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1045         }
1046         
1047
1048     #else
1049        
1050         //////////////////////////////////////////////////////////////////////////////////////////////////////
1051                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
1052                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1053                 //Taking advantage of shared memory to allow both threads to add labels.
1054                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1055                 /*
1056                 vector<clusterData*> pDataArray; 
1057                 DWORD   dwThreadIdArray[processors-1];
1058                 HANDLE  hThreadArray[processors-1]; 
1059                 
1060                 //Create processor worker threads.
1061                 for( int i=1; i<processors; i++ ){
1062                         // Allocate memory for thread data.
1063                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1064                         pDataArray.push_back(tempCluster);
1065                         processIDS.push_back(i);
1066             
1067                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1068                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1069                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
1070             
1071                 }
1072         
1073         //do your part
1074         listFiles = cluster(dividedNames[0], labels);
1075         
1076                 //Wait until all threads have terminated.
1077                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1078                 
1079                 //Close all thread handles and free memory allocations.
1080                 for(int i=0; i < pDataArray.size(); i++){
1081             //get tag
1082             tag = pDataArray[i]->tag;
1083             //get listfiles created
1084             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1085             //get labels
1086             set<string>::iterator it;
1087             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1088                         //check cutoff
1089             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1090                         CloseHandle(hThreadArray[i]);
1091                         delete pDataArray[i];
1092                 }
1093 */
1094         #endif          
1095         
1096         return listFiles;
1097         
1098         }
1099         catch(exception& e) {
1100                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1101                 exit(1);
1102         }
1103 }
1104 //**********************************************************************************************************************
1105
1106 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1107         try {
1108                 
1109                 vector<string> listFileNames;
1110                 double smallestCutoff = cutoff;
1111                 
1112                 //cluster each distance file
1113                 for (int i = 0; i < distNames.size(); i++) {
1114             
1115                         string thisNamefile = distNames[i].begin()->second;
1116                         string thisDistFile = distNames[i].begin()->first;
1117                         
1118                         string listFileName = "";
1119             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1120             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1121
1122                         if (m->control_pressed) { //clean up
1123                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1124                                 listFileNames.clear(); return listFileNames;
1125                         }
1126             
1127             listFileNames.push_back(listFileName);
1128         }
1129                 
1130                 cutoff = smallestCutoff;
1131                                         
1132                 return listFileNames;
1133         
1134         }
1135         catch(exception& e) {
1136                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1137                 exit(1);
1138         }
1139
1140
1141 }
1142 //**********************************************************************************************************************
1143 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1144         try {
1145         string listFileName = "";
1146         
1147         ListVector* list = NULL;
1148         ListVector oldList;
1149         RAbundVector* rabund = NULL;
1150         
1151 #ifdef USE_MPI
1152         int pid;
1153         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1154         
1155         //output your files too
1156         if (pid != 0) {
1157             cout << endl << "Reading " << thisDistFile << endl;
1158         }
1159 #endif
1160
1161         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1162         
1163         //reads phylip file storing data in 2D vector, also fills list and rabund
1164         bool sim = false;
1165                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1166         
1167         NameAssignment* nameMap = NULL;
1168         CountTable* ct = NULL;
1169         if(namefile != ""){     
1170                         nameMap = new NameAssignment(thisNamefile);
1171                         nameMap->readMap();
1172             cluster->readPhylipFile(thisDistFile, nameMap);
1173                 }else if (countfile != "") {
1174             ct = new CountTable();
1175             ct->readTable(thisNamefile);
1176             cluster->readPhylipFile(thisDistFile, ct);
1177         }
1178         tag = cluster->getTag();
1179         
1180                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1181             else { delete ct; } delete cluster; return 0; }
1182                 
1183                 list = cluster->getListVector();
1184                 rabund = cluster->getRAbundVector();
1185         
1186                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1187                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1188         listFileName = fileroot+ tag + ".list";
1189         
1190         ofstream listFile;
1191                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1192                 
1193                 float previousDist = 0.00000;
1194                 float rndPreviousDist = 0.00000;
1195                 oldList = *list;
1196                 
1197 #ifdef USE_MPI
1198         //output your files too
1199         if (pid != 0) {
1200             cout << endl << "Clustering " << thisDistFile << endl;
1201         }
1202 #endif
1203
1204         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1205         
1206                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1207                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1208                 else { delete ct; } return listFileName;  }
1209             
1210                         cluster->update(cutoff);
1211             
1212                         float dist = cluster->getSmallDist();
1213                         float rndDist;
1214                         if (hard) {
1215                                 rndDist = m->ceilDist(dist, precision); 
1216                         }else{
1217                                 rndDist = m->roundDist(dist, precision); 
1218                         }
1219             
1220             if(previousDist <= 0.0000 && dist != previousDist){
1221                 oldList.setLabel("unique");
1222                 oldList.print(listFile);
1223                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1224             }
1225             else if(rndDist != rndPreviousDist){
1226                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1227                 oldList.print(listFile);
1228                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1229             }
1230
1231             
1232                         previousDist = dist;
1233                         rndPreviousDist = rndDist;
1234                         oldList = *list;
1235                 }
1236         
1237                 if(previousDist <= 0.0000){
1238             oldList.setLabel("unique");
1239             oldList.print(listFile);
1240             if (labels.count("unique") == 0) { labels.insert("unique"); }
1241         }
1242         else if(rndPreviousDist<cutoff){
1243             oldList.setLabel(toString(rndPreviousDist,  length-1));
1244             oldList.print(listFile);
1245             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1246         }
1247
1248         
1249                 listFile.close();
1250                 
1251                 delete cluster;  delete list; delete rabund;
1252         if(namefile != ""){     delete nameMap; }
1253         else { delete ct; }
1254         
1255         m->mothurRemove(thisDistFile);
1256         m->mothurRemove(thisNamefile);
1257         
1258         return listFileName;
1259         
1260         }
1261         catch(exception& e) {
1262                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1263                 exit(1);
1264         }
1265 }
1266
1267 //**********************************************************************************************************************
1268 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1269         try {
1270         string listFileName = "";
1271         
1272         Cluster* cluster = NULL;
1273         SparseDistanceMatrix* matrix = NULL;
1274         ListVector* list = NULL;
1275         ListVector oldList;
1276         RAbundVector* rabund = NULL;
1277         
1278         if (m->control_pressed) { return listFileName; }
1279         
1280 #ifdef USE_MPI
1281         int pid;
1282         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1283         
1284         //output your files too
1285         if (pid != 0) {
1286             cout << endl << "Reading " << thisDistFile << endl;
1287         }
1288 #endif
1289         
1290         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1291         
1292         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1293         read->setCutoff(cutoff);
1294         
1295         NameAssignment* nameMap = NULL;
1296         CountTable* ct = NULL;
1297                 if(namefile != ""){     
1298                         nameMap = new NameAssignment(thisNamefile);
1299                         nameMap->readMap();
1300             read->read(nameMap);
1301                 }else if (countfile != "") {
1302             ct = new CountTable();
1303             ct->readTable(thisNamefile);
1304             read->read(ct);
1305         }else { read->read(nameMap); }
1306                 
1307                 list = read->getListVector();
1308         oldList = *list;
1309                 matrix = read->getDMatrix();
1310         
1311                 if(countfile != "") {
1312             rabund = new RAbundVector();
1313             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1314             delete ct;
1315         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1316
1317         delete read;  read = NULL;
1318         if (namefile != "") { delete nameMap; nameMap = NULL; }
1319         
1320         
1321 #ifdef USE_MPI
1322         //output your files too
1323         if (pid != 0) {
1324             cout << endl << "Clustering " << thisDistFile << endl;
1325         }
1326 #endif
1327         
1328         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1329                 
1330         //create cluster
1331         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1332         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1333         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1334         tag = cluster->getTag();
1335                 
1336         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1337         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1338         
1339         ofstream listFile;
1340         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1341                 
1342         listFileName = fileroot+ tag + ".list";
1343         
1344         float previousDist = 0.00000;
1345         float rndPreviousDist = 0.00000;
1346         
1347         oldList = *list;
1348         
1349         print_start = true;
1350         start = time(NULL);
1351         double saveCutoff = cutoff;
1352                 
1353         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1354             
1355             if (m->control_pressed) { //clean up
1356                 delete matrix; delete list;     delete cluster; delete rabund;
1357                 listFile.close();
1358                 m->mothurRemove(listFileName);  
1359                 return listFileName;
1360             }
1361             
1362             cluster->update(saveCutoff);
1363             
1364             float dist = matrix->getSmallDist();
1365             float rndDist;
1366             if (hard) {
1367                 rndDist = m->ceilDist(dist, precision); 
1368             }else{
1369                 rndDist = m->roundDist(dist, precision); 
1370             }
1371             
1372             if(previousDist <= 0.0000 && dist != previousDist){
1373                 oldList.setLabel("unique");
1374                 oldList.print(listFile);
1375                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1376             }
1377             else if(rndDist != rndPreviousDist){
1378                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1379                 oldList.print(listFile);
1380                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1381             }
1382             
1383             previousDist = dist;
1384             rndPreviousDist = rndDist;
1385             oldList = *list;
1386         }
1387         
1388                 
1389         if(previousDist <= 0.0000){
1390             oldList.setLabel("unique");
1391             oldList.print(listFile);
1392             if (labels.count("unique") == 0) { labels.insert("unique"); }
1393         }
1394         else if(rndPreviousDist<cutoff){
1395             oldList.setLabel(toString(rndPreviousDist,  length-1));
1396             oldList.print(listFile);
1397             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1398         }
1399         
1400         delete matrix; delete list;     delete cluster; delete rabund; 
1401         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1402         listFile.close();
1403         
1404         if (m->control_pressed) { //clean up
1405             m->mothurRemove(listFileName);      
1406             return listFileName;
1407         }
1408         
1409         m->mothurRemove(thisDistFile);
1410         m->mothurRemove(thisNamefile);
1411         
1412         if (saveCutoff != cutoff) { 
1413             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1414             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1415                         
1416             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1417         }
1418         
1419         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1420         
1421         return listFileName;
1422         
1423         }
1424         catch(exception& e) {
1425                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1426                 exit(1);
1427         }
1428 }
1429 //**********************************************************************************************************************
1430
1431 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1432         try{
1433                 
1434 #ifdef USE_MPI
1435                 int pid;
1436                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1437                 
1438                 if (pid != 0) {
1439 #endif
1440                 
1441                 string thisOutputDir = outputDir;
1442                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1443                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1444                 m->mothurRemove(outputFileName);
1445                 
1446                 
1447                 for (int i = 0; i < distNames.size(); i++) {
1448                         if (m->control_pressed) {  return 0; }
1449                         
1450                         string thisDistFile = distNames[i].begin()->first;
1451                         
1452                         m->appendFiles(thisDistFile, outputFileName);
1453                 }       
1454                         
1455                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1456                         
1457 #ifdef USE_MPI
1458                 }
1459 #endif
1460                                 
1461                 return 0;       
1462                 
1463                 
1464         }
1465         catch(exception& e) {
1466                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1467                 exit(1);
1468         }
1469 }
1470 //**********************************************************************************************************************
1471 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1472     try {
1473         rabund->setLabel(list->getLabel());        
1474         for(int i = 0; i < list->getNumBins(); i++) { 
1475             if (m->control_pressed) { break; }
1476             vector<string> binNames;
1477             string bin = list->get(i);
1478             m->splitAtComma(bin, binNames);
1479             int total = 0;
1480             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1481             rabund->push_back(total);   
1482         }
1483         return 0;
1484     }
1485     catch(exception& e) {
1486                 m->errorOut(e, "ClusterCommand", "createRabund");
1487                 exit(1);
1488         }
1489     
1490 }
1491 //**********************************************************************************************************************