]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
19                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
21                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
22                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
23                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
24                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
25                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
26                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
29                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
30                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
31                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
32         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
33                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
34                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
35                         
36                 vector<string> myArray;
37                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
38                 return myArray;
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
42                 exit(1);
43         }
44 }
45 //**********************************************************************************************************************
46 string ClusterSplitCommand::getHelpString(){    
47         try {
48                 string helpString = "";
49                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
50                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
51                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
52                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
53                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
54                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
55                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
56                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
57                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
58                 helpString += "The name parameter allows you to enter your name file. \n";
59         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
60                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
61                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
62                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
63                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
64                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
65                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
66                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
67         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
68 #ifdef USE_MPI
69                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
70 #endif
71                 helpString += "The cluster.split command should be in the following format: \n";
72                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
73                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
74                 return helpString;
75         }
76         catch(exception& e) {
77                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
78                 exit(1);
79         }
80 }
81 //**********************************************************************************************************************
82 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){     
83         try {
84         string outputFileName = "";
85                 map<string, vector<string> >::iterator it;
86         
87         //is this a type this command creates
88         it = outputTypes.find(type);
89         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
90         else {
91             if (type == "list") {  outputFileName =  "list"; }
92             else if (type == "rabund") {  outputFileName =  "rabund"; }
93             else if (type == "sabund") {  outputFileName =  "sabund"; }
94             else if (type == "column") {  outputFileName =  "dist"; }
95             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
96         }
97         return outputFileName;
98         }
99         catch(exception& e) {
100                 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
101                 exit(1);
102         }
103 }
104 //**********************************************************************************************************************
105 ClusterSplitCommand::ClusterSplitCommand(){     
106         try {
107                 abort = true; calledHelp = true; 
108                 setParameters();
109                 vector<string> tempOutNames;
110                 outputTypes["list"] = tempOutNames;
111                 outputTypes["rabund"] = tempOutNames;
112                 outputTypes["sabund"] = tempOutNames;
113                 outputTypes["column"] = tempOutNames;
114         }
115         catch(exception& e) {
116                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
117                 exit(1);
118         }
119 }
120 //**********************************************************************************************************************
121 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
122 ClusterSplitCommand::ClusterSplitCommand(string option)  {
123         try{
124                 abort = false; calledHelp = false;   
125                 format = "";
126                 
127                 //allow user to run help
128                 if(option == "help") { help(); abort = true; calledHelp = true; }
129                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
130                 
131                 else {
132                         vector<string> myArray = setParameters();
133                         
134                         OptionParser parser(option);
135                         map<string,string> parameters = parser.getParameters();
136                         
137                         ValidParameters validParameter("cluster.split");
138                 
139                         //check to make sure all parameters are valid for command
140                         map<string,string>::iterator it;
141                         for (it = parameters.begin(); it != parameters.end(); it++) { 
142                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
143                                         abort = true;
144                                 }
145                         }
146                         
147                         //initialize outputTypes
148                         vector<string> tempOutNames;
149                         outputTypes["list"] = tempOutNames;
150                         outputTypes["rabund"] = tempOutNames;
151                         outputTypes["sabund"] = tempOutNames;
152                         outputTypes["column"] = tempOutNames;
153                         
154                         //if the user changes the output directory command factory will send this info to us in the output parameter 
155                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
156                         
157                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
158                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
159                         if (inputDir == "not found"){   inputDir = "";          }
160                         else {
161                                 string path;
162                                 it = parameters.find("phylip");
163                                 //user has given a template file
164                                 if(it != parameters.end()){ 
165                                         path = m->hasPath(it->second);
166                                         //if the user has not given a path then, add inputdir. else leave path alone.
167                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
168                                 }
169                                 
170                                 it = parameters.find("column");
171                                 //user has given a template file
172                                 if(it != parameters.end()){ 
173                                         path = m->hasPath(it->second);
174                                         //if the user has not given a path then, add inputdir. else leave path alone.
175                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
176                                 }
177                                 
178                                 it = parameters.find("name");
179                                 //user has given a template file
180                                 if(it != parameters.end()){ 
181                                         path = m->hasPath(it->second);
182                                         //if the user has not given a path then, add inputdir. else leave path alone.
183                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
184                                 }
185                                 
186                                 it = parameters.find("taxonomy");
187                                 //user has given a template file
188                                 if(it != parameters.end()){ 
189                                         path = m->hasPath(it->second);
190                                         //if the user has not given a path then, add inputdir. else leave path alone.
191                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
192                                 }
193                                 
194                                 it = parameters.find("fasta");
195                                 //user has given a template file
196                                 if(it != parameters.end()){ 
197                                         path = m->hasPath(it->second);
198                                         //if the user has not given a path then, add inputdir. else leave path alone.
199                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
200                                 }
201                 
202                 it = parameters.find("count");
203                                 //user has given a template file
204                                 if(it != parameters.end()){ 
205                                         path = m->hasPath(it->second);
206                                         //if the user has not given a path then, add inputdir. else leave path alone.
207                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
208                                 }
209                         }
210                         
211                         //check for required parameters
212                         phylipfile = validParameter.validFile(parameters, "phylip", true);
213                         if (phylipfile == "not open") { abort = true; }
214                         else if (phylipfile == "not found") { phylipfile = ""; }        
215                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
216                         
217                         columnfile = validParameter.validFile(parameters, "column", true);
218                         if (columnfile == "not open") { abort = true; } 
219                         else if (columnfile == "not found") { columnfile = ""; }
220                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
221                         
222                         namefile = validParameter.validFile(parameters, "name", true);
223                         if (namefile == "not open") { abort = true; namefile = "";}     
224                         else if (namefile == "not found") { namefile = "";  }
225                         else { m->setNameFile(namefile); }
226             
227             countfile = validParameter.validFile(parameters, "count", true);
228                         if (countfile == "not open") { abort = true; countfile = "";}   
229                         else if (countfile == "not found") { countfile = "";  }
230                         else { m->setCountTableFile(countfile); }
231                         
232                         fastafile = validParameter.validFile(parameters, "fasta", true);
233                         if (fastafile == "not open") { abort = true; }  
234                         else if (fastafile == "not found") { fastafile = ""; }
235                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
236                         
237                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
238                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
239                         else if (taxFile == "not found") { taxFile = ""; }
240                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
241                         
242                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
243                                 //is there are current file available for either of these?
244                                 //give priority to column, then phylip, then fasta
245                                 columnfile = m->getColumnFile(); 
246                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
247                                 else { 
248                                         phylipfile = m->getPhylipFile(); 
249                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
250                                         else { 
251                                                 fastafile = m->getFastaFile(); 
252                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
253                                                 else { 
254                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
255                                                         abort = true; 
256                                                 }
257                                         }
258                                 }
259                         }
260                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
261             
262             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
263             
264                         if (columnfile != "") {
265                                 if ((namefile == "") && (countfile == "")) { 
266                                         namefile = m->getNameFile(); 
267                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
268                                         else { 
269                                                 countfile = m->getCountTableFile();
270                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
271                         else { 
272                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
273                             abort = true; 
274                         }       
275                                         }       
276                                 }
277                         }
278                         
279                         if (fastafile != "") {
280                                 if (taxFile == "") { 
281                                         taxFile = m->getTaxonomyFile(); 
282                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
283                                         else { 
284                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
285                                                 abort = true; 
286                                         }       
287                                 }
288                                 
289                                 if ((namefile == "") && (countfile == "")) { 
290                                         namefile = m->getNameFile(); 
291                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
292                                         else { 
293                                                 countfile = m->getCountTableFile();
294                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
295                         else { 
296                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); 
297                             abort = true; 
298                         }       
299                                         }       
300                                 }
301                         }
302                                         
303                         //check for optional parameter and set defaults
304                         // ...at some point should added some additional type checking...
305                         //get user cutoff and precision or use defaults
306                         string temp;
307                         temp = validParameter.validFile(parameters, "precision", false);
308                         if (temp == "not found") { temp = "100"; }
309                         //saves precision legnth for formatting below
310                         length = temp.length();
311                         m->mothurConvert(temp, precision); 
312                         
313                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
314                         hard = m->isTrue(temp);
315                         
316                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
317                         large = m->isTrue(temp);
318             
319                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
320                         m->setProcessors(temp);
321                         m->mothurConvert(temp, processors);
322                         
323                         temp = validParameter.validFile(parameters, "splitmethod", false);      
324                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
325                                 if (temp == "not found")  { splitmethod = "distance"; }
326                                 else {  splitmethod = temp; }
327                         }
328                         
329             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
330                         classic = m->isTrue(temp);
331             
332             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
333
334                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
335                         m->mothurConvert(temp, cutoff); 
336                         cutoff += (5 / (precision * 10.0));  
337                         
338                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
339                         m->mothurConvert(temp, taxLevelCutoff); 
340                         
341                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
342                         
343                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
344                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
345                         
346                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
347                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
348                         
349                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
350
351                         showabund = validParameter.validFile(parameters, "showabund", false);
352                         if (showabund == "not found") { showabund = "T"; }
353
354                         timing = validParameter.validFile(parameters, "timing", false);
355                         if (timing == "not found") { timing = "F"; }
356                         
357                 }
358         }
359         catch(exception& e) {
360                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
361                 exit(1);
362         }
363 }
364
365 //**********************************************************************************************************************
366
367 int ClusterSplitCommand::execute(){
368         try {
369         
370                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
371                 
372                 time_t estart;
373                 vector<string> listFileNames;
374                 set<string> labels;
375                 string singletonName = "";
376                 double saveCutoff = cutoff;
377
378                 //****************** file prep work ******************************//
379                 #ifdef USE_MPI
380                         int pid;
381                         int tag = 2001;
382                         MPI_Status status; 
383                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
385                         
386                         if (pid == 0) { //only process 0 converts and splits
387                         
388                 #endif
389                 
390                 //if user gave a phylip file convert to column file
391                 if (format == "phylip") {
392                         estart = time(NULL);
393                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
394                         
395                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
396                         
397                         NameAssignment* nameMap = NULL;
398                         convert->setFormat("phylip");
399                         convert->read(nameMap);
400                         
401                         if (m->control_pressed) {  delete convert;  return 0;  }
402                         
403                         distfile = convert->getOutputFile();
404                 
405                         //if no names file given with phylip file, create it
406                         ListVector* listToMakeNameFile =  convert->getListVector();
407                         if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
408                                 ofstream out;
409                                 namefile = phylipfile + ".names";
410                                 m->openOutputFile(namefile, out);
411                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412                                         string bin = listToMakeNameFile->get(i);
413                                         out << bin << '\t' << bin << endl;
414                                 }
415                                 out.close();
416                         }
417                         delete listToMakeNameFile;
418                         delete convert;
419                         
420                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
421                 }
422                 if (m->control_pressed) { return 0; }
423                 
424                 estart = time(NULL);
425                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
426                 
427                 //split matrix into non-overlapping groups
428                 SplitMatrix* split;
429                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
430                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
431                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
432                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
433                 
434                 split->split();
435                 
436                 if (m->control_pressed) { delete split; return 0; }
437                 
438                 singletonName = split->getSingletonNames();
439                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
440                 delete split;
441                 
442                 //output a merged distance file
443                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
444                         
445                                 
446                 if (m->control_pressed) { return 0; }
447                 
448                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
449                 estart = time(NULL);
450                 
451                 //****************** break up files between processes and cluster each file set ******************************//
452         #ifdef USE_MPI
453                         ////you are process 0 from above////
454                         
455                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
456                         dividedNames.resize(processors);
457                                         
458                         //for each file group figure out which process will complete it
459                         //want to divide the load intelligently so the big files are spread between processes
460                         for (int i = 0; i < distName.size(); i++) { 
461                                 int processToAssign = (i+1) % processors; 
462                                 if (processToAssign == 0) { processToAssign = processors; }
463                                                 
464                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
465                         }
466                                         
467                         //not lets reverse the order of ever other process, so we balance big files running with little ones
468                         for (int i = 0; i < processors; i++) {
469                                 int remainder = ((i+1) % processors);
470                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
471                         }
472                         
473                         
474                         //send each child the list of files it needs to process
475                         for(int i = 1; i < processors; i++) { 
476                                 //send number of file pairs
477                                 int num = dividedNames[i].size();
478                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
479                                 
480                                 for (int j = 0; j < num; j++) { //send filenames to process i
481                                         char tempDistFileName[1024];
482                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
483                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
484                                         
485                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
486                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
487                                         
488                                         char tempNameFileName[1024];
489                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
490                                         int lengthName = (dividedNames[i][j].begin()->second).length();
491
492                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
493                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
494                                 }
495                         }
496                         
497                         //process your share
498                         listFileNames = cluster(dividedNames[0], labels);
499                         
500                         //receive the other processes info
501                         for(int i = 1; i < processors; i++) { 
502                                 int num = dividedNames[i].size();
503                                 
504                                 double tempCutoff;
505                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
506                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
507                                 
508                                 //send list filenames to root process
509                                 for (int j = 0; j < num; j++) {  
510                                         int lengthList = 0;
511                                         char tempListFileName[1024];
512                                 
513                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
514                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
515                                 
516                                         string myListFileName = tempListFileName;
517                                         myListFileName = myListFileName.substr(0, lengthList);
518                                         
519                                         listFileNames.push_back(myListFileName);
520                                 }
521                                 
522                                 //send Labels to root process
523                                 int numLabels = 0;
524                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
525                                 
526                                 for (int j = 0; j < numLabels; j++) {  
527                                         int lengthLabel = 0;
528                                         char tempLabel[100];
529                                 
530                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
531                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
532                                 
533                                         string myLabel = tempLabel;
534                                         myLabel = myLabel.substr(0, lengthLabel);
535                         
536                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
537                                 }
538                         }
539                         
540                 }else { //you are a child process
541                         vector < map<string, string> >  myNames;
542                         
543                         //recieve the files you need to process
544                         //receive number of file pairs
545                         int num = 0;
546                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
547                         
548                         myNames.resize(num);
549         
550                         for (int j = 0; j < num; j++) { //receive filenames to process 
551                                 int lengthDist = 0;
552                                 char tempDistFileName[1024];
553                                 
554                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
555                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
556                                 
557                                 string myDistFileName = tempDistFileName;
558                                 myDistFileName = myDistFileName.substr(0, lengthDist);
559                         
560                                 int lengthName = 0;
561                                 char tempNameFileName[1024];
562                                 
563                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
564                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
565                                 
566                                 string myNameFileName = tempNameFileName;
567                                 myNameFileName = myNameFileName.substr(0, lengthName);
568                                 
569                                 //save file name
570                                 myNames[j][myDistFileName] = myNameFileName;
571                         }
572         
573                         //process them
574                         listFileNames = cluster(myNames, labels);
575                         
576                         //send cutoff
577                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
578                         
579                         //send list filenames to root process
580                         for (int j = 0; j < num; j++) {  
581                                 char tempListFileName[1024];
582                                 strcpy(tempListFileName, listFileNames[j].c_str());
583                                 int lengthList = listFileNames[j].length();
584                                         
585                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
586                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
587                         }
588                         
589                         //send Labels to root process
590                         int numLabels = labels.size();
591                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
592                         
593                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
594                                 char tempLabel[100];
595                                 strcpy(tempLabel, (*it).c_str());
596                                 int lengthLabel = (*it).length();
597                                         
598                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
599                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
600                         }
601                 }
602                 
603                 //make everyone wait
604                 MPI_Barrier(MPI_COMM_WORLD);
605                 
606         #else
607                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
608                 //sanity check
609                 if (processors > distName.size()) { processors = distName.size(); }
610                 
611                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
612                                 if(processors == 1){
613                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
614                                 }else{
615                                         listFileNames = createProcesses(distName, labels);
616                 }
617                 #else
618                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
619                 #endif
620         #endif  
621                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
622                 
623                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
624                 
625                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
626                 
627                 //****************** merge list file and create rabund and sabund files ******************************//
628                 estart = time(NULL);
629                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
630                 
631                 #ifdef USE_MPI
632                         if (pid == 0) { //only process 0 merges
633                 #endif
634
635                 ListVector* listSingle;
636                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
637                 
638                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
639                 
640                 mergeLists(listFileNames, labelBins, listSingle);
641
642                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
643                 
644                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
645                 
646                 //set list file as new current listfile
647                 string current = "";
648                 itTypes = outputTypes.find("list");
649                 if (itTypes != outputTypes.end()) {
650                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
651                 }
652                 
653                 //set rabund file as new current rabundfile
654                 itTypes = outputTypes.find("rabund");
655                 if (itTypes != outputTypes.end()) {
656                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
657                 }
658                 
659                 //set sabund file as new current sabundfile
660                 itTypes = outputTypes.find("sabund");
661                 if (itTypes != outputTypes.end()) {
662                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
663                 }
664                                 
665                 m->mothurOutEndLine();
666                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
667                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
668                 m->mothurOutEndLine();
669                 
670                 #ifdef USE_MPI
671                         } //only process 0 merges
672                         
673                         //make everyone wait
674                         MPI_Barrier(MPI_COMM_WORLD);
675                 #endif
676
677                 return 0;
678         }
679         catch(exception& e) {
680                 m->errorOut(e, "ClusterSplitCommand", "execute");
681                 exit(1);
682         }
683 }
684 //**********************************************************************************************************************
685 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
686         try {
687                                 
688                 map<float, int> labelBin;
689                 vector<float> orderFloat;
690                 int numSingleBins;
691                 
692                 //read in singletons
693                 if (singleton != "none") {
694             
695             ifstream in;
696             m->openInputFile(singleton, in);
697                                 
698                         string firstCol, secondCol;
699                         listSingle = new ListVector();
700             
701             if (countfile != "") { m->getline(in); m->gobble(in); }
702             
703                         while (!in.eof()) {
704                                 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
705                                 if (countfile == "") { listSingle->push_back(secondCol); }
706                 else { listSingle->push_back(firstCol); }
707                         }
708             
709                         in.close();
710                         m->mothurRemove(singleton);
711                         
712                         numSingleBins = listSingle->getNumBins();
713                 }else{  listSingle = NULL; numSingleBins = 0;  }
714                 
715                 //go through users set and make them floats so we can sort them 
716                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
717                         float temp = -10.0;
718
719                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
720                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
721                         
722                         if (temp <= cutoff) {
723                                 orderFloat.push_back(temp);
724                                 labelBin[temp] = numSingleBins; //initialize numbins 
725                         }
726                 }
727         
728                 //sort order
729                 sort(orderFloat.begin(), orderFloat.end());
730                 userLabels.clear();
731                         
732                 //get the list info from each file
733                 for (int k = 0; k < listNames.size(); k++) {
734         
735                         if (m->control_pressed) {  
736                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
737                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
738                                 return labelBin;
739                         }
740                         
741                         InputData* input = new InputData(listNames[k], "list");
742                         ListVector* list = input->getListVector();
743                         string lastLabel = list->getLabel();
744                         
745                         string filledInList = listNames[k] + "filledInTemp";
746                         ofstream outFilled;
747                         m->openOutputFile(filledInList, outFilled);
748         
749                         //for each label needed
750                         for(int l = 0; l < orderFloat.size(); l++){
751                         
752                                 string thisLabel;
753                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
754                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
755
756                                 //this file has reached the end
757                                 if (list == NULL) { 
758                                         list = input->getListVector(lastLabel, true); 
759                                 }else{  //do you have the distance, or do you need to fill in
760                                                 
761                                         float labelFloat;
762                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
763                                         else { convert(list->getLabel(), labelFloat); }
764
765                                         //check for missing labels
766                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
767                                                 //if its bigger get last label, otherwise keep it
768                                                 delete list;
769                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
770                                         }
771                                         lastLabel = list->getLabel();
772                                 }
773                                 
774                                 //print to new file
775                                 list->setLabel(thisLabel);
776                                 list->print(outFilled);
777                 
778                                 //update labelBin
779                                 labelBin[orderFloat[l]] += list->getNumBins();
780                                                                         
781                                 delete list;
782                                                                         
783                                 list = input->getListVector();
784                         }
785                         
786                         if (list != NULL) { delete list; }
787                         delete input;
788                         
789                         outFilled.close();
790                         m->mothurRemove(listNames[k]);
791                         rename(filledInList.c_str(), listNames[k].c_str());
792                 }
793                 
794                 return labelBin;
795         }
796         catch(exception& e) {
797                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
798                 exit(1);
799         }
800 }
801 //**********************************************************************************************************************
802 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
803         try {
804                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
805                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
806                 
807         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
808         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
809         string listFileName = fileroot+ tag + ".";
810         if (countfile != "") { listFileName += "unique_"; }
811         listFileName += getOutputFileNameTag("list");
812         
813         if (countfile == "") {
814             m->openOutputFile(sabundFileName,   outSabund);
815             m->openOutputFile(rabundFileName,   outRabund);
816             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
817             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
818             
819         }
820                 m->openOutputFile(listFileName, outList);
821         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
822                 
823                 
824                 map<float, int>::iterator itLabel;
825
826                 //for each label needed
827                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
828                         
829                         string thisLabel;
830                         if (itLabel->first == -1) { thisLabel = "unique"; }
831                         else { thisLabel = toString(itLabel->first,  length-1);  } 
832                         
833                         outList << thisLabel << '\t' << itLabel->second << '\t';
834             
835             RAbundVector* rabund = NULL;
836             if (countfile == "") {
837                 rabund = new RAbundVector();
838                 rabund->setLabel(thisLabel);
839             }
840
841                         //add in singletons
842                         if (listSingle != NULL) {
843                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
844                                         outList << listSingle->get(j) << '\t';
845                                         if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
846                                 }
847                         }
848                         
849                         //get the list info from each file
850                         for (int k = 0; k < listNames.size(); k++) {
851         
852                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
853                                 
854                                 InputData* input = new InputData(listNames[k], "list");
855                                 ListVector* list = input->getListVector(thisLabel);
856                                 
857                                 //this file has reached the end
858                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
859                                 else {          
860                                         for (int j = 0; j < list->getNumBins(); j++) {
861                                                 outList << list->get(j) << '\t';
862                                                 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
863                                         }
864                                         delete list;
865                                 }
866                                 delete input;
867                         }
868                         
869             if (countfile == "") {
870                 SAbundVector sabund = rabund->getSAbundVector();
871                 sabund.print(outSabund);
872                 rabund->print(outRabund);
873             }
874                         outList << endl;
875                         
876                         if (rabund != NULL) { delete rabund; }
877                 }
878                 
879                 outList.close();
880         if (countfile == "") {
881             outRabund.close();
882             outSabund.close();
883                 }
884                 if (listSingle != NULL) { delete listSingle;  }
885                 
886                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
887                 
888                 return 0;
889         }
890         catch(exception& e) {
891                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
892                 exit(1);
893         }
894 }
895
896 //**********************************************************************************************************************
897
898 void ClusterSplitCommand::printData(ListVector* oldList){
899         try {
900                 string label = oldList->getLabel();
901                 RAbundVector oldRAbund = oldList->getRAbundVector();
902                 
903                 oldRAbund.setLabel(label);
904                 if (m->isTrue(showabund)) {
905                         oldRAbund.getSAbundVector().print(cout);
906                 }
907                 oldRAbund.print(outRabund);
908                 oldRAbund.getSAbundVector().print(outSabund);
909         
910                 oldList->print(outList);
911         }
912         catch(exception& e) {
913                 m->errorOut(e, "ClusterSplitCommand", "printData");
914                 exit(1);
915         }
916 }
917 //**********************************************************************************************************************
918 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
919         try {
920         
921         vector<string> listFiles;
922         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
923         dividedNames.resize(processors);
924         
925         //for each file group figure out which process will complete it
926         //want to divide the load intelligently so the big files are spread between processes
927         for (int i = 0; i < distName.size(); i++) { 
928             //cout << i << endl;
929             int processToAssign = (i+1) % processors; 
930             if (processToAssign == 0) { processToAssign = processors; }
931             
932             dividedNames[(processToAssign-1)].push_back(distName[i]);
933             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
934         }
935         
936         //not lets reverse the order of ever other process, so we balance big files running with little ones
937         for (int i = 0; i < processors; i++) {
938             //cout << i << endl;
939             int remainder = ((i+1) % processors);
940             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
941         }
942         
943         if (m->control_pressed) { return listFiles; }
944         
945         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
946                 int process = 1;
947                 processIDS.clear();
948                 
949                 //loop through and create all the processes you want
950                 while (process != processors) {
951                         int pid = fork();
952                         
953                         if (pid > 0) {
954                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
955                                 process++;
956                         }else if (pid == 0){
957                                 set<string> labels;
958                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
959                                 
960                                 //write out names to file
961                                 string filename = toString(getpid()) + ".temp";
962                                 ofstream out;
963                                 m->openOutputFile(filename, out);
964                                 out << tag << endl;
965                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
966                                 out.close();
967                                 
968                                 //print out labels
969                                 ofstream outLabels;
970                                 filename = toString(getpid()) + ".temp.labels";
971                                 m->openOutputFile(filename, outLabels);
972                                 
973                                 outLabels << cutoff << endl;
974                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
975                                         outLabels << (*it) << endl;
976                                 }
977                                 outLabels.close();
978
979                                 exit(0);
980                         }else { 
981                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
982                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
983                                 exit(0);
984                         }
985                 }
986                 
987         //do your part
988         listFiles = cluster(dividedNames[0], labels);
989         
990                 //force parent to wait until all the processes are done
991                 for (int i=0;i< processIDS.size();i++) { 
992                         int temp = processIDS[i];
993                         wait(&temp);
994                 }
995         
996         //get list of list file names from each process
997         for(int i=0;i<processIDS.size();i++){
998             string filename = toString(processIDS[i]) + ".temp";
999             ifstream in;
1000             m->openInputFile(filename, in);
1001             
1002             in >> tag; m->gobble(in);
1003             
1004             while(!in.eof()) {
1005                 string tempName;
1006                 in >> tempName; m->gobble(in);
1007                 listFiles.push_back(tempName);
1008             }
1009             in.close();
1010             m->mothurRemove((toString(processIDS[i]) + ".temp"));
1011             
1012             //get labels
1013             filename = toString(processIDS[i]) + ".temp.labels";
1014             ifstream in2;
1015             m->openInputFile(filename, in2);
1016             
1017             float tempCutoff;
1018             in2 >> tempCutoff; m->gobble(in2);
1019             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1020             
1021             while(!in2.eof()) {
1022                 string tempName;
1023                 in2 >> tempName; m->gobble(in2);
1024                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1025             }
1026             in2.close();
1027             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1028         }
1029         
1030
1031     #else
1032        
1033         //////////////////////////////////////////////////////////////////////////////////////////////////////
1034                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
1035                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1036                 //Taking advantage of shared memory to allow both threads to add labels.
1037                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1038                 /*
1039                 vector<clusterData*> pDataArray; 
1040                 DWORD   dwThreadIdArray[processors-1];
1041                 HANDLE  hThreadArray[processors-1]; 
1042                 
1043                 //Create processor worker threads.
1044                 for( int i=1; i<processors; i++ ){
1045                         // Allocate memory for thread data.
1046                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1047                         pDataArray.push_back(tempCluster);
1048                         processIDS.push_back(i);
1049             
1050                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1051                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1052                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
1053             
1054                 }
1055         
1056         //do your part
1057         listFiles = cluster(dividedNames[0], labels);
1058         
1059                 //Wait until all threads have terminated.
1060                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1061                 
1062                 //Close all thread handles and free memory allocations.
1063                 for(int i=0; i < pDataArray.size(); i++){
1064             //get tag
1065             tag = pDataArray[i]->tag;
1066             //get listfiles created
1067             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1068             //get labels
1069             set<string>::iterator it;
1070             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1071                         //check cutoff
1072             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1073                         CloseHandle(hThreadArray[i]);
1074                         delete pDataArray[i];
1075                 }
1076 */
1077         #endif          
1078         
1079         return listFiles;
1080         
1081         }
1082         catch(exception& e) {
1083                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1084                 exit(1);
1085         }
1086 }
1087 //**********************************************************************************************************************
1088
1089 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1090         try {
1091                 
1092                 vector<string> listFileNames;
1093                 double smallestCutoff = cutoff;
1094                 
1095                 //cluster each distance file
1096                 for (int i = 0; i < distNames.size(); i++) {
1097             
1098                         string thisNamefile = distNames[i].begin()->second;
1099                         string thisDistFile = distNames[i].begin()->first;
1100                         
1101                         string listFileName = "";
1102             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1103             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1104
1105                         if (m->control_pressed) { //clean up
1106                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1107                                 listFileNames.clear(); return listFileNames;
1108                         }
1109             
1110             listFileNames.push_back(listFileName);
1111         }
1112                 
1113                 cutoff = smallestCutoff;
1114                                         
1115                 return listFileNames;
1116         
1117         }
1118         catch(exception& e) {
1119                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1120                 exit(1);
1121         }
1122
1123
1124 }
1125 //**********************************************************************************************************************
1126 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1127         try {
1128         string listFileName = "";
1129         
1130         ListVector* list = NULL;
1131         ListVector oldList;
1132         RAbundVector* rabund = NULL;
1133         
1134 #ifdef USE_MPI
1135         int pid;
1136         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1137         
1138         //output your files too
1139         if (pid != 0) {
1140             cout << endl << "Reading " << thisDistFile << endl;
1141         }
1142 #endif
1143
1144         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1145         
1146         //reads phylip file storing data in 2D vector, also fills list and rabund
1147         bool sim = false;
1148                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1149         
1150         NameAssignment* nameMap = NULL;
1151         CountTable* ct = NULL;
1152         if(namefile != ""){     
1153                         nameMap = new NameAssignment(thisNamefile);
1154                         nameMap->readMap();
1155             cluster->readPhylipFile(thisDistFile, nameMap);
1156                 }else if (countfile != "") {
1157             ct = new CountTable();
1158             ct->readTable(thisNamefile);
1159             cluster->readPhylipFile(thisDistFile, ct);
1160         }
1161         tag = cluster->getTag();
1162         
1163                 if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
1164             else { delete ct; } delete cluster; return 0; }
1165                 
1166                 list = cluster->getListVector();
1167                 rabund = cluster->getRAbundVector();
1168         
1169                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1170                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1171         listFileName = fileroot+ tag + ".list";
1172         
1173         ofstream listFile;
1174                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1175                 
1176                 float previousDist = 0.00000;
1177                 float rndPreviousDist = 0.00000;
1178                 oldList = *list;
1179                 
1180 #ifdef USE_MPI
1181         //output your files too
1182         if (pid != 0) {
1183             cout << endl << "Clustering " << thisDistFile << endl;
1184         }
1185 #endif
1186
1187         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1188         
1189                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1190                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
1191                 else { delete ct; } return listFileName;  }
1192             
1193                         cluster->update(cutoff);
1194             
1195                         float dist = cluster->getSmallDist();
1196                         float rndDist;
1197                         if (hard) {
1198                                 rndDist = m->ceilDist(dist, precision); 
1199                         }else{
1200                                 rndDist = m->roundDist(dist, precision); 
1201                         }
1202             
1203             if(previousDist <= 0.0000 && dist != previousDist){
1204                 oldList.setLabel("unique");
1205                 oldList.print(listFile);
1206                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1207             }
1208             else if(rndDist != rndPreviousDist){
1209                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1210                 oldList.print(listFile);
1211                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1212             }
1213
1214             
1215                         previousDist = dist;
1216                         rndPreviousDist = rndDist;
1217                         oldList = *list;
1218                 }
1219         
1220                 if(previousDist <= 0.0000){
1221             oldList.setLabel("unique");
1222             oldList.print(listFile);
1223             if (labels.count("unique") == 0) { labels.insert("unique"); }
1224         }
1225         else if(rndPreviousDist<cutoff){
1226             oldList.setLabel(toString(rndPreviousDist,  length-1));
1227             oldList.print(listFile);
1228             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1229         }
1230
1231         
1232                 listFile.close();
1233                 
1234                 delete cluster;  delete list; delete rabund;
1235         if(namefile != ""){     delete nameMap; }
1236         else { delete ct; }
1237         
1238         m->mothurRemove(thisDistFile);
1239         m->mothurRemove(thisNamefile);
1240         
1241         return listFileName;
1242         
1243         }
1244         catch(exception& e) {
1245                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1246                 exit(1);
1247         }
1248 }
1249
1250 //**********************************************************************************************************************
1251 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1252         try {
1253         string listFileName = "";
1254         
1255         Cluster* cluster = NULL;
1256         SparseDistanceMatrix* matrix = NULL;
1257         ListVector* list = NULL;
1258         ListVector oldList;
1259         RAbundVector* rabund = NULL;
1260         
1261         if (m->control_pressed) { return listFileName; }
1262         
1263 #ifdef USE_MPI
1264         int pid;
1265         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1266         
1267         //output your files too
1268         if (pid != 0) {
1269             cout << endl << "Reading " << thisDistFile << endl;
1270         }
1271 #endif
1272         
1273         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1274         
1275         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1276         read->setCutoff(cutoff);
1277         
1278         NameAssignment* nameMap = NULL;
1279         CountTable* ct = NULL;
1280                 if(namefile != ""){     
1281                         nameMap = new NameAssignment(thisNamefile);
1282                         nameMap->readMap();
1283             read->read(nameMap);
1284                 }else if (countfile != "") {
1285             ct = new CountTable();
1286             ct->readTable(thisNamefile);
1287             read->read(ct);
1288         }
1289                 
1290                 list = read->getListVector();
1291         oldList = *list;
1292                 matrix = read->getDMatrix();
1293         
1294                 if(countfile != "") {
1295             rabund = new RAbundVector();
1296             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1297             delete ct;
1298         }else { rabund = new RAbundVector(list->getRAbundVector()); }
1299
1300         delete read;  read = NULL;
1301         if (namefile != "") { delete nameMap; nameMap = NULL; }
1302         
1303         
1304 #ifdef USE_MPI
1305         //output your files too
1306         if (pid != 0) {
1307             cout << endl << "Clustering " << thisDistFile << endl;
1308         }
1309 #endif
1310         
1311         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1312                 
1313         //create cluster
1314         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1315         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1316         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1317         tag = cluster->getTag();
1318                 
1319         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1320         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1321         
1322         ofstream listFile;
1323         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1324                 
1325         listFileName = fileroot+ tag + ".list";
1326         
1327         float previousDist = 0.00000;
1328         float rndPreviousDist = 0.00000;
1329         
1330         oldList = *list;
1331         
1332         print_start = true;
1333         start = time(NULL);
1334         double saveCutoff = cutoff;
1335                 
1336         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1337             
1338             if (m->control_pressed) { //clean up
1339                 delete matrix; delete list;     delete cluster; delete rabund;
1340                 listFile.close();
1341                 m->mothurRemove(listFileName);  
1342                 return listFileName;
1343             }
1344             
1345             cluster->update(saveCutoff);
1346             
1347             float dist = matrix->getSmallDist();
1348             float rndDist;
1349             if (hard) {
1350                 rndDist = m->ceilDist(dist, precision); 
1351             }else{
1352                 rndDist = m->roundDist(dist, precision); 
1353             }
1354             
1355             if(previousDist <= 0.0000 && dist != previousDist){
1356                 oldList.setLabel("unique");
1357                 oldList.print(listFile);
1358                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1359             }
1360             else if(rndDist != rndPreviousDist){
1361                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1362                 oldList.print(listFile);
1363                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1364             }
1365             
1366             previousDist = dist;
1367             rndPreviousDist = rndDist;
1368             oldList = *list;
1369         }
1370         
1371                 
1372         if(previousDist <= 0.0000){
1373             oldList.setLabel("unique");
1374             oldList.print(listFile);
1375             if (labels.count("unique") == 0) { labels.insert("unique"); }
1376         }
1377         else if(rndPreviousDist<cutoff){
1378             oldList.setLabel(toString(rndPreviousDist,  length-1));
1379             oldList.print(listFile);
1380             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1381         }
1382         
1383         delete matrix; delete list;     delete cluster; delete rabund; 
1384         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1385         listFile.close();
1386         
1387         if (m->control_pressed) { //clean up
1388             m->mothurRemove(listFileName);      
1389             return listFileName;
1390         }
1391         
1392         m->mothurRemove(thisDistFile);
1393         m->mothurRemove(thisNamefile);
1394         
1395         if (saveCutoff != cutoff) { 
1396             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1397             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1398                         
1399             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1400         }
1401         
1402         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1403         
1404         return listFileName;
1405         
1406         }
1407         catch(exception& e) {
1408                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1409                 exit(1);
1410         }
1411 }
1412 //**********************************************************************************************************************
1413
1414 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1415         try{
1416                 
1417 #ifdef USE_MPI
1418                 int pid;
1419                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1420                 
1421                 if (pid != 0) {
1422 #endif
1423                 
1424                 string thisOutputDir = outputDir;
1425                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1426                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1427                 m->mothurRemove(outputFileName);
1428                 
1429                 
1430                 for (int i = 0; i < distNames.size(); i++) {
1431                         if (m->control_pressed) {  return 0; }
1432                         
1433                         string thisDistFile = distNames[i].begin()->first;
1434                         
1435                         m->appendFiles(thisDistFile, outputFileName);
1436                 }       
1437                         
1438                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1439                         
1440 #ifdef USE_MPI
1441                 }
1442 #endif
1443                                 
1444                 return 0;       
1445                 
1446                 
1447         }
1448         catch(exception& e) {
1449                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1450                 exit(1);
1451         }
1452 }
1453 //**********************************************************************************************************************
1454 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1455     try {
1456         rabund->setLabel(list->getLabel());        
1457         for(int i = 0; i < list->getNumBins(); i++) { 
1458             if (m->control_pressed) { break; }
1459             vector<string> binNames;
1460             string bin = list->get(i);
1461             m->splitAtComma(bin, binNames);
1462             int total = 0;
1463             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
1464             rabund->push_back(total);   
1465         }
1466         return 0;
1467     }
1468     catch(exception& e) {
1469                 m->errorOut(e, "ClusterCommand", "createRabund");
1470                 exit(1);
1471         }
1472     
1473 }
1474 //**********************************************************************************************************************