]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){    
15         try {
16                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
17                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
19                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
20                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
21                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
22                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
23                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
24                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
25                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
26                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
27                 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
28                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
29                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
30                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
31         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
32                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
34                         
35                 vector<string> myArray;
36                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
37                 return myArray;
38         }
39         catch(exception& e) {
40                 m->errorOut(e, "ClusterSplitCommand", "setParameters");
41                 exit(1);
42         }
43 }
44 //**********************************************************************************************************************
45 string ClusterSplitCommand::getHelpString(){    
46         try {
47                 string helpString = "";
48                 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
49                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
50                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
51                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
52                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
53                 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n";
54                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
55                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
56                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
57                 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
58                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
59                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
60                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
61                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
62                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
63                 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
64                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
65         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic.  It is only valid with splitmethod=fasta. Default=f.\n";
66 #ifdef USE_MPI
67                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
68 #endif
69                 helpString += "The cluster.split command should be in the following format: \n";
70                 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
71                 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";       
72                 return helpString;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
76                 exit(1);
77         }
78 }
79 //**********************************************************************************************************************
80 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){     
81         try {
82         string outputFileName = "";
83                 map<string, vector<string> >::iterator it;
84         
85         //is this a type this command creates
86         it = outputTypes.find(type);
87         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
88         else {
89             if (type == "list") {  outputFileName =  "list"; }
90             else if (type == "rabund") {  outputFileName =  "rabund"; }
91             else if (type == "sabund") {  outputFileName =  "sabund"; }
92             else if (type == "column") {  outputFileName =  "dist"; }
93             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
94         }
95         return outputFileName;
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
99                 exit(1);
100         }
101 }
102 //**********************************************************************************************************************
103 ClusterSplitCommand::ClusterSplitCommand(){     
104         try {
105                 abort = true; calledHelp = true; 
106                 setParameters();
107                 vector<string> tempOutNames;
108                 outputTypes["list"] = tempOutNames;
109                 outputTypes["rabund"] = tempOutNames;
110                 outputTypes["sabund"] = tempOutNames;
111                 outputTypes["column"] = tempOutNames;
112         }
113         catch(exception& e) {
114                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
115                 exit(1);
116         }
117 }
118 //**********************************************************************************************************************
119 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
120 ClusterSplitCommand::ClusterSplitCommand(string option)  {
121         try{
122                 abort = false; calledHelp = false;   
123                 format = "";
124                 
125                 //allow user to run help
126                 if(option == "help") { help(); abort = true; calledHelp = true; }
127                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
128                 
129                 else {
130                         vector<string> myArray = setParameters();
131                         
132                         OptionParser parser(option);
133                         map<string,string> parameters = parser.getParameters();
134                         
135                         ValidParameters validParameter("cluster.split");
136                 
137                         //check to make sure all parameters are valid for command
138                         map<string,string>::iterator it;
139                         for (it = parameters.begin(); it != parameters.end(); it++) { 
140                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
141                                         abort = true;
142                                 }
143                         }
144                         
145                         //initialize outputTypes
146                         vector<string> tempOutNames;
147                         outputTypes["list"] = tempOutNames;
148                         outputTypes["rabund"] = tempOutNames;
149                         outputTypes["sabund"] = tempOutNames;
150                         outputTypes["column"] = tempOutNames;
151                         
152                         //if the user changes the output directory command factory will send this info to us in the output parameter 
153                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
154                         
155                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
156                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
157                         if (inputDir == "not found"){   inputDir = "";          }
158                         else {
159                                 string path;
160                                 it = parameters.find("phylip");
161                                 //user has given a template file
162                                 if(it != parameters.end()){ 
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
166                                 }
167                                 
168                                 it = parameters.find("column");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
174                                 }
175                                 
176                                 it = parameters.find("name");
177                                 //user has given a template file
178                                 if(it != parameters.end()){ 
179                                         path = m->hasPath(it->second);
180                                         //if the user has not given a path then, add inputdir. else leave path alone.
181                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
182                                 }
183                                 
184                                 it = parameters.find("taxonomy");
185                                 //user has given a template file
186                                 if(it != parameters.end()){ 
187                                         path = m->hasPath(it->second);
188                                         //if the user has not given a path then, add inputdir. else leave path alone.
189                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
190                                 }
191                                 
192                                 it = parameters.find("fasta");
193                                 //user has given a template file
194                                 if(it != parameters.end()){ 
195                                         path = m->hasPath(it->second);
196                                         //if the user has not given a path then, add inputdir. else leave path alone.
197                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
198                                 }
199                         }
200                         
201                         //check for required parameters
202                         phylipfile = validParameter.validFile(parameters, "phylip", true);
203                         if (phylipfile == "not open") { abort = true; }
204                         else if (phylipfile == "not found") { phylipfile = ""; }        
205                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
206                         
207                         columnfile = validParameter.validFile(parameters, "column", true);
208                         if (columnfile == "not open") { abort = true; } 
209                         else if (columnfile == "not found") { columnfile = ""; }
210                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
211                         
212                         namefile = validParameter.validFile(parameters, "name", true);
213                         if (namefile == "not open") { abort = true; }   
214                         else if (namefile == "not found") { namefile = "";  }
215                         else { m->setNameFile(namefile); }
216                         
217                         fastafile = validParameter.validFile(parameters, "fasta", true);
218                         if (fastafile == "not open") { abort = true; }  
219                         else if (fastafile == "not found") { fastafile = ""; }
220                         else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
221                         
222                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
223                         if (taxFile == "not open") { taxFile = ""; abort = true; }      
224                         else if (taxFile == "not found") { taxFile = ""; }
225                         else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
226                         
227                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
228                                 //is there are current file available for either of these?
229                                 //give priority to column, then phylip, then fasta
230                                 columnfile = m->getColumnFile(); 
231                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
232                                 else { 
233                                         phylipfile = m->getPhylipFile(); 
234                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
235                                         else { 
236                                                 fastafile = m->getFastaFile(); 
237                                                 if (fastafile != "") {  m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
238                                                 else { 
239                                                         m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); 
240                                                         abort = true; 
241                                                 }
242                                         }
243                                 }
244                         }
245                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
246                 
247                         if (columnfile != "") {
248                                 if (namefile == "") { 
249                                         namefile = m->getNameFile(); 
250                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
251                                         else { 
252                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
253                                                 abort = true; 
254                                         }       
255                                 }
256                         }
257                         
258                         if (fastafile != "") {
259                                 if (taxFile == "") { 
260                                         taxFile = m->getTaxonomyFile(); 
261                                         if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
262                                         else { 
263                                                 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
264                                                 abort = true; 
265                                         }       
266                                 }
267                                 
268                                 if (namefile == "") { 
269                                         namefile = m->getNameFile(); 
270                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
271                                         else { 
272                                                 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
273                                                 abort = true; 
274                                         }       
275                                 }
276                         }
277                                         
278                         //check for optional parameter and set defaults
279                         // ...at some point should added some additional type checking...
280                         //get user cutoff and precision or use defaults
281                         string temp;
282                         temp = validParameter.validFile(parameters, "precision", false);
283                         if (temp == "not found") { temp = "100"; }
284                         //saves precision legnth for formatting below
285                         length = temp.length();
286                         m->mothurConvert(temp, precision); 
287                         
288                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
289                         hard = m->isTrue(temp);
290                         
291                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
292                         large = m->isTrue(temp);
293             
294                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
295                         m->setProcessors(temp);
296                         m->mothurConvert(temp, processors);
297                         
298                         temp = validParameter.validFile(parameters, "splitmethod", false);      
299                         if ((splitmethod != "fasta") && (splitmethod != "classify")) {
300                                 if (temp == "not found")  { splitmethod = "distance"; }
301                                 else {  splitmethod = temp; }
302                         }
303                         
304             temp = validParameter.validFile(parameters, "classic", false);                      if (temp == "not found") { temp = "F"; }
305                         classic = m->isTrue(temp);
306             
307             if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
308
309                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
310                         m->mothurConvert(temp, cutoff); 
311                         cutoff += (5 / (precision * 10.0));  
312                         
313                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
314                         m->mothurConvert(temp, taxLevelCutoff); 
315                         
316                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
317                         
318                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
319                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
320                         
321                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
322                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
323                         
324                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
325
326                         showabund = validParameter.validFile(parameters, "showabund", false);
327                         if (showabund == "not found") { showabund = "T"; }
328
329                         timing = validParameter.validFile(parameters, "timing", false);
330                         if (timing == "not found") { timing = "F"; }
331                         
332                 }
333         }
334         catch(exception& e) {
335                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
336                 exit(1);
337         }
338 }
339
340 //**********************************************************************************************************************
341
342 int ClusterSplitCommand::execute(){
343         try {
344         
345                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
346                 
347                 time_t estart;
348                 vector<string> listFileNames;
349                 set<string> labels;
350                 string singletonName = "";
351                 double saveCutoff = cutoff;
352
353                 //****************** file prep work ******************************//
354                 #ifdef USE_MPI
355                         int pid;
356                         int tag = 2001;
357                         MPI_Status status; 
358                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
359                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
360                         
361                         if (pid == 0) { //only process 0 converts and splits
362                         
363                 #endif
364                 
365                 //if user gave a phylip file convert to column file
366                 if (format == "phylip") {
367                         estart = time(NULL);
368                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
369                         
370                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
371                         
372                         NameAssignment* nameMap = NULL;
373                         convert->setFormat("phylip");
374                         convert->read(nameMap);
375                         
376                         if (m->control_pressed) {  delete convert;  return 0;  }
377                         
378                         distfile = convert->getOutputFile();
379                 
380                         //if no names file given with phylip file, create it
381                         ListVector* listToMakeNameFile =  convert->getListVector();
382                         if (namefile == "") {  //you need to make a namefile for split matrix
383                                 ofstream out;
384                                 namefile = phylipfile + ".names";
385                                 m->openOutputFile(namefile, out);
386                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
387                                         string bin = listToMakeNameFile->get(i);
388                                         out << bin << '\t' << bin << endl;
389                                 }
390                                 out.close();
391                         }
392                         delete listToMakeNameFile;
393                         delete convert;
394                         
395                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
396                 }
397                 if (m->control_pressed) { return 0; }
398                 
399                 estart = time(NULL);
400                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
401                 
402                 //split matrix into non-overlapping groups
403                 SplitMatrix* split;
404                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
405                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
406                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);     }
407                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
408                 
409                 split->split();
410                 
411                 if (m->control_pressed) { delete split; return 0; }
412                 
413                 singletonName = split->getSingletonNames();
414                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
415                 delete split;
416                 
417                 //output a merged distance file
418                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
419                         
420                                 
421                 if (m->control_pressed) { return 0; }
422                 
423                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
424                 estart = time(NULL);
425                 
426                 //****************** break up files between processes and cluster each file set ******************************//
427         #ifdef USE_MPI
428                         ////you are process 0 from above////
429                         
430                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
431                         dividedNames.resize(processors);
432                                         
433                         //for each file group figure out which process will complete it
434                         //want to divide the load intelligently so the big files are spread between processes
435                         for (int i = 0; i < distName.size(); i++) { 
436                                 int processToAssign = (i+1) % processors; 
437                                 if (processToAssign == 0) { processToAssign = processors; }
438                                                 
439                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
440                         }
441                                         
442                         //not lets reverse the order of ever other process, so we balance big files running with little ones
443                         for (int i = 0; i < processors; i++) {
444                                 int remainder = ((i+1) % processors);
445                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
446                         }
447                         
448                         
449                         //send each child the list of files it needs to process
450                         for(int i = 1; i < processors; i++) { 
451                                 //send number of file pairs
452                                 int num = dividedNames[i].size();
453                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
454                                 
455                                 for (int j = 0; j < num; j++) { //send filenames to process i
456                                         char tempDistFileName[1024];
457                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
458                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
459                                         
460                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
461                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
462                                         
463                                         char tempNameFileName[1024];
464                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
465                                         int lengthName = (dividedNames[i][j].begin()->second).length();
466
467                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
468                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
469                                 }
470                         }
471                         
472                         //process your share
473                         listFileNames = cluster(dividedNames[0], labels);
474                         
475                         //receive the other processes info
476                         for(int i = 1; i < processors; i++) { 
477                                 int num = dividedNames[i].size();
478                                 
479                                 double tempCutoff;
480                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
481                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
482                                 
483                                 //send list filenames to root process
484                                 for (int j = 0; j < num; j++) {  
485                                         int lengthList = 0;
486                                         char tempListFileName[1024];
487                                 
488                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
489                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
490                                 
491                                         string myListFileName = tempListFileName;
492                                         myListFileName = myListFileName.substr(0, lengthList);
493                                         
494                                         listFileNames.push_back(myListFileName);
495                                 }
496                                 
497                                 //send Labels to root process
498                                 int numLabels = 0;
499                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
500                                 
501                                 for (int j = 0; j < numLabels; j++) {  
502                                         int lengthLabel = 0;
503                                         char tempLabel[100];
504                                 
505                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
506                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
507                                 
508                                         string myLabel = tempLabel;
509                                         myLabel = myLabel.substr(0, lengthLabel);
510                         
511                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
512                                 }
513                         }
514                         
515                 }else { //you are a child process
516                         vector < map<string, string> >  myNames;
517                         
518                         //recieve the files you need to process
519                         //receive number of file pairs
520                         int num = 0;
521                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
522                         
523                         myNames.resize(num);
524         
525                         for (int j = 0; j < num; j++) { //receive filenames to process 
526                                 int lengthDist = 0;
527                                 char tempDistFileName[1024];
528                                 
529                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
530                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
531                                 
532                                 string myDistFileName = tempDistFileName;
533                                 myDistFileName = myDistFileName.substr(0, lengthDist);
534                         
535                                 int lengthName = 0;
536                                 char tempNameFileName[1024];
537                                 
538                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
539                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
540                                 
541                                 string myNameFileName = tempNameFileName;
542                                 myNameFileName = myNameFileName.substr(0, lengthName);
543                                 
544                                 //save file name
545                                 myNames[j][myDistFileName] = myNameFileName;
546                         }
547         
548                         //process them
549                         listFileNames = cluster(myNames, labels);
550                         
551                         //send cutoff
552                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
553                         
554                         //send list filenames to root process
555                         for (int j = 0; j < num; j++) {  
556                                 char tempListFileName[1024];
557                                 strcpy(tempListFileName, listFileNames[j].c_str());
558                                 int lengthList = listFileNames[j].length();
559                                         
560                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
561                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
562                         }
563                         
564                         //send Labels to root process
565                         int numLabels = labels.size();
566                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
567                         
568                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
569                                 char tempLabel[100];
570                                 strcpy(tempLabel, (*it).c_str());
571                                 int lengthLabel = (*it).length();
572                                         
573                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
574                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
575                         }
576                 }
577                 
578                 //make everyone wait
579                 MPI_Barrier(MPI_COMM_WORLD);
580                 
581         #else
582                 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
583                 //sanity check
584                 if (processors > distName.size()) { processors = distName.size(); }
585                 
586                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
587                                 if(processors == 1){
588                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
589                                 }else{
590                                         listFileNames = createProcesses(distName, labels);
591                 }
592                 #else
593                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
594                 #endif
595         #endif  
596                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
597                 
598                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
599                 
600                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
601                 
602                 //****************** merge list file and create rabund and sabund files ******************************//
603                 estart = time(NULL);
604                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
605                 
606                 #ifdef USE_MPI
607                         if (pid == 0) { //only process 0 merges
608                 #endif
609
610                 ListVector* listSingle;
611                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
612                 
613                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
614                 
615                 mergeLists(listFileNames, labelBins, listSingle);
616
617                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
618                 
619                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
620                 
621                 //set list file as new current listfile
622                 string current = "";
623                 itTypes = outputTypes.find("list");
624                 if (itTypes != outputTypes.end()) {
625                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
626                 }
627                 
628                 //set rabund file as new current rabundfile
629                 itTypes = outputTypes.find("rabund");
630                 if (itTypes != outputTypes.end()) {
631                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
632                 }
633                 
634                 //set sabund file as new current sabundfile
635                 itTypes = outputTypes.find("sabund");
636                 if (itTypes != outputTypes.end()) {
637                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
638                 }
639                                 
640                 m->mothurOutEndLine();
641                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
642                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
643                 m->mothurOutEndLine();
644                 
645                 #ifdef USE_MPI
646                         } //only process 0 merges
647                         
648                         //make everyone wait
649                         MPI_Barrier(MPI_COMM_WORLD);
650                 #endif
651
652                 return 0;
653         }
654         catch(exception& e) {
655                 m->errorOut(e, "ClusterSplitCommand", "execute");
656                 exit(1);
657         }
658 }
659 //**********************************************************************************************************************
660 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
661         try {
662                                 
663                 map<float, int> labelBin;
664                 vector<float> orderFloat;
665                 int numSingleBins;
666                 
667                 //read in singletons
668                 if (singleton != "none") {
669                         ifstream in;
670                         m->openInputFile(singleton, in);
671                                 
672                         string firstCol, secondCol;
673                         listSingle = new ListVector();
674                         while (!in.eof()) {
675                                 in >> firstCol >> secondCol; m->gobble(in);
676                                 listSingle->push_back(secondCol);
677                         }
678                         in.close();
679                         m->mothurRemove(singleton);
680                         
681                         numSingleBins = listSingle->getNumBins();
682                 }else{  listSingle = NULL; numSingleBins = 0;  }
683                 
684                 //go through users set and make them floats so we can sort them 
685                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
686                         float temp = -10.0;
687
688                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
689                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
690                         
691                         if (temp <= cutoff) {
692                                 orderFloat.push_back(temp);
693                                 labelBin[temp] = numSingleBins; //initialize numbins 
694                         }
695                 }
696         
697                 //sort order
698                 sort(orderFloat.begin(), orderFloat.end());
699                 userLabels.clear();
700                         
701                 //get the list info from each file
702                 for (int k = 0; k < listNames.size(); k++) {
703         
704                         if (m->control_pressed) {  
705                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
706                                 for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
707                                 return labelBin;
708                         }
709                         
710                         InputData* input = new InputData(listNames[k], "list");
711                         ListVector* list = input->getListVector();
712                         string lastLabel = list->getLabel();
713                         
714                         string filledInList = listNames[k] + "filledInTemp";
715                         ofstream outFilled;
716                         m->openOutputFile(filledInList, outFilled);
717         
718                         //for each label needed
719                         for(int l = 0; l < orderFloat.size(); l++){
720                         
721                                 string thisLabel;
722                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
723                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
724
725                                 //this file has reached the end
726                                 if (list == NULL) { 
727                                         list = input->getListVector(lastLabel, true); 
728                                 }else{  //do you have the distance, or do you need to fill in
729                                                 
730                                         float labelFloat;
731                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
732                                         else { convert(list->getLabel(), labelFloat); }
733
734                                         //check for missing labels
735                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
736                                                 //if its bigger get last label, otherwise keep it
737                                                 delete list;
738                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
739                                         }
740                                         lastLabel = list->getLabel();
741                                 }
742                                 
743                                 //print to new file
744                                 list->setLabel(thisLabel);
745                                 list->print(outFilled);
746                 
747                                 //update labelBin
748                                 labelBin[orderFloat[l]] += list->getNumBins();
749                                                                         
750                                 delete list;
751                                                                         
752                                 list = input->getListVector();
753                         }
754                         
755                         if (list != NULL) { delete list; }
756                         delete input;
757                         
758                         outFilled.close();
759                         m->mothurRemove(listNames[k]);
760                         rename(filledInList.c_str(), listNames[k].c_str());
761                 }
762                 
763                 return labelBin;
764         }
765         catch(exception& e) {
766                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
767                 exit(1);
768         }
769 }
770 //**********************************************************************************************************************
771 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
772         try {
773                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
774                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
775                 
776         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
777         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
778         string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list");
779         
780                 m->openOutputFile(sabundFileName,       outSabund);
781                 m->openOutputFile(rabundFileName,       outRabund);
782                 m->openOutputFile(listFileName, outList);
783                 
784                 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
785                 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
786                 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);               
787                 map<float, int>::iterator itLabel;
788
789                 //for each label needed
790                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
791                         
792                         string thisLabel;
793                         if (itLabel->first == -1) { thisLabel = "unique"; }
794                         else { thisLabel = toString(itLabel->first,  length-1);  } 
795                         
796                         outList << thisLabel << '\t' << itLabel->second << '\t';
797
798                         RAbundVector* rabund = new RAbundVector();
799                         rabund->setLabel(thisLabel);
800
801                         //add in singletons
802                         if (listSingle != NULL) {
803                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
804                                         outList << listSingle->get(j) << '\t';
805                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
806                                 }
807                         }
808                         
809                         //get the list info from each file
810                         for (int k = 0; k < listNames.size(); k++) {
811         
812                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
813                                 
814                                 InputData* input = new InputData(listNames[k], "list");
815                                 ListVector* list = input->getListVector(thisLabel);
816                                 
817                                 //this file has reached the end
818                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
819                                 else {          
820                                         for (int j = 0; j < list->getNumBins(); j++) {
821                                                 outList << list->get(j) << '\t';
822                                                 rabund->push_back(m->getNumNames(list->get(j)));
823                                         }
824                                         delete list;
825                                 }
826                                 delete input;
827                         }
828                         
829                         SAbundVector sabund = rabund->getSAbundVector();
830                         
831                         sabund.print(outSabund);
832                         rabund->print(outRabund);
833                         outList << endl;
834                         
835                         delete rabund;
836                 }
837                 
838                 outList.close();
839                 outRabund.close();
840                 outSabund.close();
841                 
842                 if (listSingle != NULL) { delete listSingle;  }
843                 
844                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
845                 
846                 return 0;
847         }
848         catch(exception& e) {
849                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
850                 exit(1);
851         }
852 }
853
854 //**********************************************************************************************************************
855
856 void ClusterSplitCommand::printData(ListVector* oldList){
857         try {
858                 string label = oldList->getLabel();
859                 RAbundVector oldRAbund = oldList->getRAbundVector();
860                 
861                 oldRAbund.setLabel(label);
862                 if (m->isTrue(showabund)) {
863                         oldRAbund.getSAbundVector().print(cout);
864                 }
865                 oldRAbund.print(outRabund);
866                 oldRAbund.getSAbundVector().print(outSabund);
867         
868                 oldList->print(outList);
869         }
870         catch(exception& e) {
871                 m->errorOut(e, "ClusterSplitCommand", "printData");
872                 exit(1);
873         }
874 }
875 //**********************************************************************************************************************
876 vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
877         try {
878         
879         vector<string> listFiles;
880         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
881         dividedNames.resize(processors);
882         
883         //for each file group figure out which process will complete it
884         //want to divide the load intelligently so the big files are spread between processes
885         for (int i = 0; i < distName.size(); i++) { 
886             //cout << i << endl;
887             int processToAssign = (i+1) % processors; 
888             if (processToAssign == 0) { processToAssign = processors; }
889             
890             dividedNames[(processToAssign-1)].push_back(distName[i]);
891             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
892         }
893         
894         //not lets reverse the order of ever other process, so we balance big files running with little ones
895         for (int i = 0; i < processors; i++) {
896             //cout << i << endl;
897             int remainder = ((i+1) % processors);
898             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
899         }
900         
901         if (m->control_pressed) { return listFiles; }
902         
903         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
904                 int process = 1;
905                 processIDS.clear();
906                 
907                 //loop through and create all the processes you want
908                 while (process != processors) {
909                         int pid = fork();
910                         
911                         if (pid > 0) {
912                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
913                                 process++;
914                         }else if (pid == 0){
915                                 set<string> labels;
916                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
917                                 
918                                 //write out names to file
919                                 string filename = toString(getpid()) + ".temp";
920                                 ofstream out;
921                                 m->openOutputFile(filename, out);
922                                 out << tag << endl;
923                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
924                                 out.close();
925                                 
926                                 //print out labels
927                                 ofstream outLabels;
928                                 filename = toString(getpid()) + ".temp.labels";
929                                 m->openOutputFile(filename, outLabels);
930                                 
931                                 outLabels << cutoff << endl;
932                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
933                                         outLabels << (*it) << endl;
934                                 }
935                                 outLabels.close();
936
937                                 exit(0);
938                         }else { 
939                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
940                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
941                                 exit(0);
942                         }
943                 }
944                 
945         //do your part
946         listFiles = cluster(dividedNames[0], labels);
947         
948                 //force parent to wait until all the processes are done
949                 for (int i=0;i< processIDS.size();i++) { 
950                         int temp = processIDS[i];
951                         wait(&temp);
952                 }
953         
954         //get list of list file names from each process
955         for(int i=0;i<processIDS.size();i++){
956             string filename = toString(processIDS[i]) + ".temp";
957             ifstream in;
958             m->openInputFile(filename, in);
959             
960             in >> tag; m->gobble(in);
961             
962             while(!in.eof()) {
963                 string tempName;
964                 in >> tempName; m->gobble(in);
965                 listFiles.push_back(tempName);
966             }
967             in.close();
968             m->mothurRemove((toString(processIDS[i]) + ".temp"));
969             
970             //get labels
971             filename = toString(processIDS[i]) + ".temp.labels";
972             ifstream in2;
973             m->openInputFile(filename, in2);
974             
975             float tempCutoff;
976             in2 >> tempCutoff; m->gobble(in2);
977             if (tempCutoff < cutoff) { cutoff = tempCutoff; }
978             
979             while(!in2.eof()) {
980                 string tempName;
981                 in2 >> tempName; m->gobble(in2);
982                 if (labels.count(tempName) == 0) { labels.insert(tempName); }
983             }
984             in2.close();
985             m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
986         }
987         
988
989     #else
990        
991         //////////////////////////////////////////////////////////////////////////////////////////////////////
992                 //Windows version shared memory, so be careful when passing variables through the clusterData struct. 
993                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
994                 //Taking advantage of shared memory to allow both threads to add labels.
995                 //////////////////////////////////////////////////////////////////////////////////////////////////////
996                 
997                 vector<clusterData*> pDataArray; 
998                 DWORD   dwThreadIdArray[processors-1];
999                 HANDLE  hThreadArray[processors-1]; 
1000                 
1001                 //Create processor worker threads.
1002                 for( int i=1; i<processors; i++ ){
1003                         // Allocate memory for thread data.
1004                         clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1005                         pDataArray.push_back(tempCluster);
1006                         processIDS.push_back(i);
1007             
1008                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1009                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1010                         hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
1011             
1012                 }
1013         
1014         //do your part
1015         listFiles = cluster(dividedNames[0], labels);
1016         
1017                 //Wait until all threads have terminated.
1018                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1019                 
1020                 //Close all thread handles and free memory allocations.
1021                 for(int i=0; i < pDataArray.size(); i++){
1022             //get tag
1023             tag = pDataArray[i]->tag;
1024             //get listfiles created
1025             for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1026             //get labels
1027             set<string>::iterator it;
1028             for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1029                         //check cutoff
1030             if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1031                         CloseHandle(hThreadArray[i]);
1032                         delete pDataArray[i];
1033                 }
1034
1035         #endif          
1036         
1037         return listFiles;
1038         
1039         }
1040         catch(exception& e) {
1041                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1042                 exit(1);
1043         }
1044 }
1045 //**********************************************************************************************************************
1046
1047 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1048         try {
1049                 
1050                 vector<string> listFileNames;
1051                 double smallestCutoff = cutoff;
1052                 
1053                 //cluster each distance file
1054                 for (int i = 0; i < distNames.size(); i++) {
1055             
1056                         string thisNamefile = distNames[i].begin()->second;
1057                         string thisDistFile = distNames[i].begin()->first;
1058                         
1059                         string listFileName = "";
1060             if (classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff);   }
1061             else            {  listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff);          }
1062
1063                         if (m->control_pressed) { //clean up
1064                                 for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
1065                                 listFileNames.clear(); return listFileNames;
1066                         }
1067             
1068             listFileNames.push_back(listFileName);
1069         }
1070                 
1071                 cutoff = smallestCutoff;
1072                                         
1073                 return listFileNames;
1074         
1075         }
1076         catch(exception& e) {
1077                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1078                 exit(1);
1079         }
1080
1081
1082 }
1083 //**********************************************************************************************************************
1084 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1085         try {
1086         string listFileName = "";
1087         
1088         ListVector* list = NULL;
1089         ListVector oldList;
1090         RAbundVector* rabund = NULL;
1091         
1092 #ifdef USE_MPI
1093         int pid;
1094         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1095         
1096         //output your files too
1097         if (pid != 0) {
1098             cout << endl << "Reading " << thisDistFile << endl;
1099         }
1100 #endif
1101
1102         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1103         
1104         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1105         nameMap->readMap();
1106                                 
1107                 //reads phylip file storing data in 2D vector, also fills list and rabund
1108         bool sim = false;
1109                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1110                 cluster->readPhylipFile(thisDistFile, nameMap);
1111                 tag = cluster->getTag();
1112         
1113                 if (m->control_pressed) { delete cluster; return 0; }
1114                 
1115                 list = cluster->getListVector();
1116                 rabund = cluster->getRAbundVector();
1117         
1118                 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1119                 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1120         listFileName = fileroot+ tag + ".list";
1121         
1122         ofstream listFile;
1123                 m->openOutputFile(fileroot+ tag + ".list",      listFile);
1124                 
1125                 float previousDist = 0.00000;
1126                 float rndPreviousDist = 0.00000;
1127                 oldList = *list;
1128                 
1129 #ifdef USE_MPI
1130         //output your files too
1131         if (pid != 0) {
1132             cout << endl << "Clustering " << thisDistFile << endl;
1133         }
1134 #endif
1135
1136         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1137         
1138                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1139                         if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  return listFileName;  }
1140             
1141                         cluster->update(cutoff);
1142             
1143                         float dist = cluster->getSmallDist();
1144                         float rndDist;
1145                         if (hard) {
1146                                 rndDist = m->ceilDist(dist, precision); 
1147                         }else{
1148                                 rndDist = m->roundDist(dist, precision); 
1149                         }
1150             
1151             if(previousDist <= 0.0000 && dist != previousDist){
1152                 oldList.setLabel("unique");
1153                 oldList.print(listFile);
1154                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1155             }
1156             else if(rndDist != rndPreviousDist){
1157                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1158                 oldList.print(listFile);
1159                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1160             }
1161
1162             
1163                         previousDist = dist;
1164                         rndPreviousDist = rndDist;
1165                         oldList = *list;
1166                 }
1167         
1168                 if(previousDist <= 0.0000){
1169             oldList.setLabel("unique");
1170             oldList.print(listFile);
1171             if (labels.count("unique") == 0) { labels.insert("unique"); }
1172         }
1173         else if(rndPreviousDist<cutoff){
1174             oldList.setLabel(toString(rndPreviousDist,  length-1));
1175             oldList.print(listFile);
1176             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1177         }
1178
1179         
1180                 listFile.close();
1181                 
1182                 delete cluster; delete nameMap; delete list; delete rabund;
1183
1184         
1185         return listFileName;
1186         
1187         }
1188         catch(exception& e) {
1189                 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1190                 exit(1);
1191         }
1192 }
1193
1194 //**********************************************************************************************************************
1195 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1196         try {
1197         string listFileName = "";
1198         
1199         Cluster* cluster = NULL;
1200         SparseDistanceMatrix* matrix = NULL;
1201         ListVector* list = NULL;
1202         ListVector oldList;
1203         RAbundVector* rabund = NULL;
1204         
1205         if (m->control_pressed) { return listFileName; }
1206         
1207 #ifdef USE_MPI
1208         int pid;
1209         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1210         
1211         //output your files too
1212         if (pid != 0) {
1213             cout << endl << "Reading " << thisDistFile << endl;
1214         }
1215 #endif
1216         
1217         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1218         
1219         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
1220         read->setCutoff(cutoff);
1221         
1222         NameAssignment* nameMap = new NameAssignment(thisNamefile);
1223         nameMap->readMap();
1224         read->read(nameMap);
1225         
1226         if (m->control_pressed) {  delete read; delete nameMap; return listFileName; }
1227         
1228         list = read->getListVector();
1229         oldList = *list;
1230         matrix = read->getDMatrix();
1231         
1232         delete read;  read = NULL;
1233         delete nameMap; nameMap = NULL;
1234         
1235         
1236 #ifdef USE_MPI
1237         //output your files too
1238         if (pid != 0) {
1239             cout << endl << "Clustering " << thisDistFile << endl;
1240         }
1241 #endif
1242         
1243         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1244                 
1245         rabund = new RAbundVector(list->getRAbundVector());
1246         
1247         //create cluster
1248         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1249         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1250         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
1251         tag = cluster->getTag();
1252                 
1253         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1254         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1255         
1256         ofstream listFile;
1257         m->openOutputFile(fileroot+ tag + ".list",      listFile);
1258                 
1259         listFileName = fileroot+ tag + ".list";
1260         
1261         float previousDist = 0.00000;
1262         float rndPreviousDist = 0.00000;
1263         
1264         oldList = *list;
1265         
1266         print_start = true;
1267         start = time(NULL);
1268         double saveCutoff = cutoff;
1269                 
1270         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1271             
1272             if (m->control_pressed) { //clean up
1273                 delete matrix; delete list;     delete cluster; delete rabund;
1274                 listFile.close();
1275                 m->mothurRemove(listFileName);  
1276                 return listFileName;
1277             }
1278             
1279             cluster->update(saveCutoff);
1280             
1281             float dist = matrix->getSmallDist();
1282             float rndDist;
1283             if (hard) {
1284                 rndDist = m->ceilDist(dist, precision); 
1285             }else{
1286                 rndDist = m->roundDist(dist, precision); 
1287             }
1288             
1289             if(previousDist <= 0.0000 && dist != previousDist){
1290                 oldList.setLabel("unique");
1291                 oldList.print(listFile);
1292                 if (labels.count("unique") == 0) {  labels.insert("unique");  }
1293             }
1294             else if(rndDist != rndPreviousDist){
1295                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1296                 oldList.print(listFile);
1297                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1298             }
1299             
1300             previousDist = dist;
1301             rndPreviousDist = rndDist;
1302             oldList = *list;
1303         }
1304         
1305                 
1306         if(previousDist <= 0.0000){
1307             oldList.setLabel("unique");
1308             oldList.print(listFile);
1309             if (labels.count("unique") == 0) { labels.insert("unique"); }
1310         }
1311         else if(rndPreviousDist<cutoff){
1312             oldList.setLabel(toString(rndPreviousDist,  length-1));
1313             oldList.print(listFile);
1314             if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1315         }
1316         
1317         delete matrix; delete list;     delete cluster; delete rabund; 
1318         matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1319         listFile.close();
1320         
1321         if (m->control_pressed) { //clean up
1322             m->mothurRemove(listFileName);      
1323             return listFileName;
1324         }
1325         
1326         m->mothurRemove(thisDistFile);
1327         m->mothurRemove(thisNamefile);
1328         
1329         if (saveCutoff != cutoff) { 
1330             if (hard)   {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1331             else                {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1332                         
1333             m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1334         }
1335         
1336         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1337         
1338         return listFileName;
1339         
1340         }
1341         catch(exception& e) {
1342                 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1343                 exit(1);
1344         }
1345 }
1346 //**********************************************************************************************************************
1347
1348 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1349         try{
1350                 
1351 #ifdef USE_MPI
1352                 int pid;
1353                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1354                 
1355                 if (pid != 0) {
1356 #endif
1357                 
1358                 string thisOutputDir = outputDir;
1359                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1360                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1361                 m->mothurRemove(outputFileName);
1362                 
1363                 
1364                 for (int i = 0; i < distNames.size(); i++) {
1365                         if (m->control_pressed) {  return 0; }
1366                         
1367                         string thisDistFile = distNames[i].begin()->first;
1368                         
1369                         m->appendFiles(thisDistFile, outputFileName);
1370                 }       
1371                         
1372                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1373                         
1374 #ifdef USE_MPI
1375                 }
1376 #endif
1377                                 
1378                 return 0;       
1379                 
1380                 
1381         }
1382         catch(exception& e) {
1383                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1384                 exit(1);
1385         }
1386 }
1387 //**********************************************************************************************************************