]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
fixed cluster.split with average method
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
21         try{
22                 globaldata = GlobalData::getInstance();
23                 abort = false;
24                 format = "";
25                 
26                 //allow user to run help
27                 if(option == "help") { help(); abort = true; }
28                 
29                 else {
30                         //valid paramters for this command
31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33                         
34                         OptionParser parser(option);
35                         map<string,string> parameters = parser.getParameters();
36                         
37                         ValidParameters validParameter;
38                 
39                         //check to make sure all parameters are valid for command
40                         map<string,string>::iterator it;
41                         for (it = parameters.begin(); it != parameters.end(); it++) { 
42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
43                                         abort = true;
44                                 }
45                         }
46                         
47                         globaldata->newRead();
48                         
49                         //if the user changes the output directory command factory will send this info to us in the output parameter 
50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
51                         
52                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
54                         if (inputDir == "not found"){   inputDir = "";          }
55                         else {
56                                 string path;
57                                 it = parameters.find("phylip");
58                                 //user has given a template file
59                                 if(it != parameters.end()){ 
60                                         path = hasPath(it->second);
61                                         //if the user has not given a path then, add inputdir. else leave path alone.
62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
63                                 }
64                                 
65                                 it = parameters.find("column");
66                                 //user has given a template file
67                                 if(it != parameters.end()){ 
68                                         path = hasPath(it->second);
69                                         //if the user has not given a path then, add inputdir. else leave path alone.
70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
71                                 }
72                                 
73                                 it = parameters.find("name");
74                                 //user has given a template file
75                                 if(it != parameters.end()){ 
76                                         path = hasPath(it->second);
77                                         //if the user has not given a path then, add inputdir. else leave path alone.
78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
79                                 }
80                                 
81                                 it = parameters.find("taxonomy");
82                                 //user has given a template file
83                                 if(it != parameters.end()){ 
84                                         path = hasPath(it->second);
85                                         //if the user has not given a path then, add inputdir. else leave path alone.
86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
87                                 }
88                                 
89                                 it = parameters.find("fasta");
90                                 //user has given a template file
91                                 if(it != parameters.end()){ 
92                                         path = hasPath(it->second);
93                                         //if the user has not given a path then, add inputdir. else leave path alone.
94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
95                                 }
96                         }
97                         
98                         //check for required parameters
99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
100                         if (phylipfile == "not open") { abort = true; }
101                         else if (phylipfile == "not found") { phylipfile = ""; }        
102                         else {  distfile = phylipfile;  format = "phylip";      }
103                         
104                         columnfile = validParameter.validFile(parameters, "column", true);
105                         if (columnfile == "not open") { abort = true; } 
106                         else if (columnfile == "not found") { columnfile = ""; }
107                         else {  distfile = columnfile; format = "column";       }
108                         
109                         namefile = validParameter.validFile(parameters, "name", true);
110                         if (namefile == "not open") { abort = true; }   
111                         else if (namefile == "not found") { namefile = ""; }
112                         
113                         fastafile = validParameter.validFile(parameters, "fasta", true);
114                         if (fastafile == "not open") { abort = true; }  
115                         else if (fastafile == "not found") { fastafile = ""; }
116                         else { distfile = fastafile;  splitmethod = "fasta";  }
117                         
118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
119                         if (taxFile == "not open") { abort = true; }    
120                         else if (taxFile == "not found") { taxFile = ""; }
121                         
122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
124                 
125                         if (columnfile != "") {
126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
127                         }
128                         
129                         if (fastafile != "") {
130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
131                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
132                         }
133                                         
134                         //check for optional parameter and set defaults
135                         // ...at some point should added some additional type checking...
136                         //get user cutoff and precision or use defaults
137                         string temp;
138                         temp = validParameter.validFile(parameters, "precision", false);
139                         if (temp == "not found") { temp = "100"; }
140                         //saves precision legnth for formatting below
141                         length = temp.length();
142                         convert(temp, precision); 
143                         
144                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
145                         hard = isTrue(temp);
146                         
147                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
148                         large = isTrue(temp);
149                         
150                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
151                         convert(temp, processors); 
152                         
153                         temp = validParameter.validFile(parameters, "splitmethod", false);      
154                         if (splitmethod != "fasta") {
155                                 if (temp == "not found")  { splitmethod = "distance"; }
156                                 else {  splitmethod = temp; }
157                         }
158                         
159                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
160                         convert(temp, cutoff); 
161                         cutoff += (5 / (precision * 10.0));  
162                         
163                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
164                         convert(temp, taxLevelCutoff); 
165                         
166                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
167                         
168                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
169                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
170                         
171                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
172                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
173                         
174                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
175
176                         showabund = validParameter.validFile(parameters, "showabund", false);
177                         if (showabund == "not found") { showabund = "T"; }
178
179                         timing = validParameter.validFile(parameters, "timing", false);
180                         if (timing == "not found") { timing = "F"; }
181                         
182                 }
183         }
184         catch(exception& e) {
185                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
186                 exit(1);
187         }
188 }
189
190 //**********************************************************************************************************************
191
192 void ClusterSplitCommand::help(){
193         try {
194                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
195                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
196                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
197                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
198                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
199                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
200                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
201                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
202                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
203                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
204                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
205                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
206                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
207                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
208                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
209                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
210                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
211                 #ifdef USE_MPI
212                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
213                 #endif
214                 m->mothurOut("The cluster.split command should be in the following format: \n");
215                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
216                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
217
218         }
219         catch(exception& e) {
220                 m->errorOut(e, "ClusterSplitCommand", "help");
221                 exit(1);
222         }
223 }
224
225 //**********************************************************************************************************************
226
227 ClusterSplitCommand::~ClusterSplitCommand(){}
228
229 //**********************************************************************************************************************
230
231 int ClusterSplitCommand::execute(){
232         try {
233         
234                 if (abort == true) {    return 0;       }
235                 
236                 time_t estart;
237                 vector<string> listFileNames;
238                 set<string> labels;
239                 string singletonName = "";
240                 double saveCutoff = cutoff;
241
242                 //****************** file prep work ******************************//
243                 #ifdef USE_MPI
244                         int pid;
245                         int tag = 2001;
246                         MPI_Status status; 
247                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
248                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
249                         
250                         if (pid == 0) { //only process 0 converts and splits
251                         
252                 #endif
253                 
254                 //if user gave a phylip file convert to column file
255                 if (format == "phylip") {
256                         estart = time(NULL);
257                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
258                         
259                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
260                         
261                         NameAssignment* nameMap = NULL;
262                         convert->setFormat("phylip");
263                         convert->read(nameMap);
264                         
265                         if (m->control_pressed) {  delete convert;  return 0;  }
266                         
267                         distfile = convert->getOutputFile();
268                 
269                         //if no names file given with phylip file, create it
270                         ListVector* listToMakeNameFile =  convert->getListVector();
271                         if (namefile == "") {  //you need to make a namefile for split matrix
272                                 ofstream out;
273                                 namefile = phylipfile + ".names";
274                                 openOutputFile(namefile, out);
275                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
276                                         string bin = listToMakeNameFile->get(i);
277                                         out << bin << '\t' << bin << endl;
278                                 }
279                                 out.close();
280                         }
281                         delete listToMakeNameFile;
282                         delete convert;
283                         
284                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
285                 }
286                 if (m->control_pressed) { return 0; }
287                 
288                 estart = time(NULL);
289                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
290                 
291                 //split matrix into non-overlapping groups
292                 SplitMatrix* split;
293                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
294                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
295                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir);      }
296                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
297                 
298                 split->split();
299                 
300                 if (m->control_pressed) { delete split; return 0; }
301                 
302                 singletonName = split->getSingletonNames();
303                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
304                 delete split;
305                 
306                 if (m->control_pressed) { return 0; }
307                 
308                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
309                 estart = time(NULL);
310                 
311                 //****************** break up files between processes and cluster each file set ******************************//
312         #ifdef USE_MPI
313                         ////you are process 0 from above////
314                         
315                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
316                         dividedNames.resize(processors);
317                                         
318                         //for each file group figure out which process will complete it
319                         //want to divide the load intelligently so the big files are spread between processes
320                         int count = 1;
321                         for (int i = 0; i < distName.size(); i++) { 
322                                 int processToAssign = (i+1) % processors; 
323                                 if (processToAssign == 0) { processToAssign = processors; }
324                                                 
325                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
326                         }
327                                         
328                         //not lets reverse the order of ever other process, so we balance big files running with little ones
329                         for (int i = 0; i < processors; i++) {
330                                 int remainder = ((i+1) % processors);
331                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
332                         }
333                         
334                         
335                         //send each child the list of files it needs to process
336                         for(int i = 1; i < processors; i++) { 
337                                 //send number of file pairs
338                                 int num = dividedNames[i].size();
339                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
340                                 
341                                 for (int j = 0; j < num; j++) { //send filenames to process i
342                                         char tempDistFileName[1024];
343                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
344                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
345                                         
346                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
347                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
348                                         
349                                         char tempNameFileName[1024];
350                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
351                                         int lengthName = (dividedNames[i][j].begin()->second).length();
352
353                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
354                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
355                                 }
356                         }
357                         
358                         //process your share
359                         listFileNames = cluster(dividedNames[0], labels);
360                         
361                         //receive the other processes info
362                         for(int i = 1; i < processors; i++) { 
363                                 int num = dividedNames[i].size();
364                                 
365                                 double tempCutoff;
366                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
367                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
368                                 
369                                 //send list filenames to root process
370                                 for (int j = 0; j < num; j++) {  
371                                         int lengthList = 0;
372                                         char tempListFileName[1024];
373                                 
374                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
375                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
376                                 
377                                         string myListFileName = tempListFileName;
378                                         myListFileName = myListFileName.substr(0, lengthList);
379                                         
380                                         listFileNames.push_back(myListFileName);
381                                 }
382                                 
383                                 //send Labels to root process
384                                 int numLabels = 0;
385                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
386                                 
387                                 for (int j = 0; j < numLabels; j++) {  
388                                         int lengthLabel = 0;
389                                         char tempLabel[100];
390                                 
391                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
392                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
393                                 
394                                         string myLabel = tempLabel;
395                                         myLabel = myLabel.substr(0, lengthLabel);
396                         
397                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
398                                 }
399                         }
400                         
401                 }else { //you are a child process
402                         vector < map<string, string> >  myNames;
403                         
404                         //recieve the files you need to process
405                         //receive number of file pairs
406                         int num = 0;
407                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
408                         
409                         myNames.resize(num);
410         
411                         for (int j = 0; j < num; j++) { //receive filenames to process 
412                                 int lengthDist = 0;
413                                 char tempDistFileName[1024];
414                                 
415                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
416                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
417                                 
418                                 string myDistFileName = tempDistFileName;
419                                 myDistFileName = myDistFileName.substr(0, lengthDist);
420                         
421                                 int lengthName = 0;
422                                 char tempNameFileName[1024];
423                                 
424                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
425                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
426                                 
427                                 string myNameFileName = tempNameFileName;
428                                 myNameFileName = myNameFileName.substr(0, lengthName);
429                                 
430                                 //save file name
431                                 myNames[j][myDistFileName] = myNameFileName;
432                         }
433         
434                         //process them
435                         listFileNames = cluster(myNames, labels);
436                         
437                         //send cutoff
438                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
439                         
440                         //send list filenames to root process
441                         for (int j = 0; j < num; j++) {  
442                                 char tempListFileName[1024];
443                                 strcpy(tempListFileName, listFileNames[j].c_str());
444                                 int lengthList = listFileNames[j].length();
445                                         
446                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
447                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
448                         }
449                         
450                         //send Labels to root process
451                         int numLabels = labels.size();
452                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
453                         
454                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
455                                 char tempLabel[100];
456                                 strcpy(tempLabel, (*it).c_str());
457                                 int lengthLabel = (*it).length();
458                                         
459                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
460                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
461                         }
462                 }
463                 
464                 //make everyone wait
465                 MPI_Barrier(MPI_COMM_WORLD);
466                 
467         #else
468
469                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
470                                 if(processors == 1){
471                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
472                                 }else{
473                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
474                                         dividedNames.resize(processors);
475                                         
476                                         //for each file group figure out which process will complete it
477                                         //want to divide the load intelligently so the big files are spread between processes
478                                         int count = 1;
479                                         for (int i = 0; i < distName.size(); i++) { 
480                                                 int processToAssign = (i+1) % processors; 
481                                                 if (processToAssign == 0) { processToAssign = processors; }
482                                                 
483                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
484                                         }
485                                         
486                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
487                                         for (int i = 0; i < processors; i++) {
488                                                 int remainder = ((i+1) % processors);
489                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
490                                         }
491                                         
492                                         createProcesses(dividedNames);
493                                                         
494                                         if (m->control_pressed) { return 0; }
495
496                                         //get list of list file names from each process
497                                         for(int i=0;i<processors;i++){
498                                                 string filename = toString(processIDS[i]) + ".temp";
499                                                 ifstream in;
500                                                 openInputFile(filename, in);
501                                                 
502                                                 in >> tag; gobble(in);
503                                                 
504                                                 while(!in.eof()) {
505                                                         string tempName;
506                                                         in >> tempName; gobble(in);
507                                                         listFileNames.push_back(tempName);
508                                                 }
509                                                 in.close();
510                                                 remove((toString(processIDS[i]) + ".temp").c_str());
511                                                 
512                                                 //get labels
513                                                 filename = toString(processIDS[i]) + ".temp.labels";
514                                                 ifstream in2;
515                                                 openInputFile(filename, in2);
516                                                 
517                                                 float tempCutoff;
518                                                 in2 >> tempCutoff; gobble(in2);
519                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
520                                                 
521                                                 while(!in2.eof()) {
522                                                         string tempName;
523                                                         in2 >> tempName; gobble(in2);
524                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
525                                                 }
526                                                 in2.close();
527                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
528                                         }
529                                 }
530                 #else
531                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
532                 #endif
533         #endif  
534                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
535                 
536                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
537                 
538                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
539                 
540                 //****************** merge list file and create rabund and sabund files ******************************//
541                 estart = time(NULL);
542                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
543                 
544                 #ifdef USE_MPI
545                         if (pid == 0) { //only process 0 merges
546                 #endif
547
548                 ListVector* listSingle;
549                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
550                 
551                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
552                 
553                 mergeLists(listFileNames, labelBins, listSingle);
554
555                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
556                 
557                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
558                 
559                 m->mothurOutEndLine();
560                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
561                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
562                 m->mothurOutEndLine();
563                 
564                 #ifdef USE_MPI
565                         } //only process 0 merges
566                         
567                         //make everyone wait
568                         MPI_Barrier(MPI_COMM_WORLD);
569                 #endif
570
571                 return 0;
572         }
573         catch(exception& e) {
574                 m->errorOut(e, "ClusterSplitCommand", "execute");
575                 exit(1);
576         }
577 }
578 //**********************************************************************************************************************
579 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
580         try {
581                                 
582                 map<float, int> labelBin;
583                 vector<float> orderFloat;
584                 int numSingleBins;
585                 
586                 //read in singletons
587                 if (singleton != "none") {
588                         ifstream in;
589                         openInputFile(singleton, in);
590                                 
591                         string firstCol, secondCol;
592                         listSingle = new ListVector();
593                         while (!in.eof()) {
594                                 in >> firstCol >> secondCol; gobble(in);
595                                 listSingle->push_back(secondCol);
596                         }
597                         in.close();
598                         remove(singleton.c_str());
599                         
600                         numSingleBins = listSingle->getNumBins();
601                 }else{  listSingle = NULL; numSingleBins = 0;  }
602                 
603                 //go through users set and make them floats so we can sort them 
604                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
605                         float temp = -10.0;
606
607                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
608                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
609                         
610                         if (temp <= cutoff) {
611                                 orderFloat.push_back(temp);
612                                 labelBin[temp] = numSingleBins; //initialize numbins 
613                         }
614                 }
615         
616                 //sort order
617                 sort(orderFloat.begin(), orderFloat.end());
618                 userLabels.clear();
619                         
620                 //get the list info from each file
621                 for (int k = 0; k < listNames.size(); k++) {
622         
623                         if (m->control_pressed) {  
624                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
625                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
626                                 return labelBin;
627                         }
628                         
629                         InputData* input = new InputData(listNames[k], "list");
630                         ListVector* list = input->getListVector();
631                         string lastLabel = list->getLabel();
632                         
633                         string filledInList = listNames[k] + "filledInTemp";
634                         ofstream outFilled;
635                         openOutputFile(filledInList, outFilled);
636         
637                         //for each label needed
638                         for(int l = 0; l < orderFloat.size(); l++){
639                         
640                                 string thisLabel;
641                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
642                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
643
644                                 //this file has reached the end
645                                 if (list == NULL) { 
646                                         list = input->getListVector(lastLabel, true); 
647                                 }else{  //do you have the distance, or do you need to fill in
648                                                 
649                                         float labelFloat;
650                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
651                                         else { convert(list->getLabel(), labelFloat); }
652
653                                         //check for missing labels
654                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
655                                                 //if its bigger get last label, otherwise keep it
656                                                 delete list;
657                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
658                                         }
659                                         lastLabel = list->getLabel();
660                                 }
661                                 
662                                 //print to new file
663                                 list->setLabel(thisLabel);
664                                 list->print(outFilled);
665                 
666                                 //update labelBin
667                                 labelBin[orderFloat[l]] += list->getNumBins();
668                                                                         
669                                 delete list;
670                                                                         
671                                 list = input->getListVector();
672                         }
673                         
674                         if (list != NULL) { delete list; }
675                         delete input;
676                         
677                         outFilled.close();
678                         remove(listNames[k].c_str());
679                         rename(filledInList.c_str(), listNames[k].c_str());
680                 }
681                 
682                 return labelBin;
683         }
684         catch(exception& e) {
685                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
686                 exit(1);
687         }
688 }
689 //**********************************************************************************************************************
690 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
691         try {
692                 if (outputDir == "") { outputDir += hasPath(distfile); }
693                 fileroot = outputDir + getRootName(getSimpleName(distfile));
694                 
695                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
696                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
697                 openOutputFile(fileroot+ tag + ".list",         outList);
698                                 
699                 outputNames.push_back(fileroot+ tag + ".sabund");
700                 outputNames.push_back(fileroot+ tag + ".rabund");
701                 outputNames.push_back(fileroot+ tag + ".list");
702                 
703                 map<float, int>::iterator itLabel;
704
705                 //for each label needed
706                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
707                         
708                         string thisLabel;
709                         if (itLabel->first == -1) { thisLabel = "unique"; }
710                         else { thisLabel = toString(itLabel->first,  length-1);  } 
711                         
712                         outList << thisLabel << '\t' << itLabel->second << '\t';
713
714                         RAbundVector* rabund = new RAbundVector();
715                         rabund->setLabel(thisLabel);
716
717                         //add in singletons
718                         if (listSingle != NULL) {
719                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
720                                         outList << listSingle->get(j) << '\t';
721                                         rabund->push_back(getNumNames(listSingle->get(j)));
722                                 }
723                         }
724                         
725                         //get the list info from each file
726                         for (int k = 0; k < listNames.size(); k++) {
727         
728                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
729                                 
730                                 InputData* input = new InputData(listNames[k], "list");
731                                 ListVector* list = input->getListVector(thisLabel);
732                                 
733                                 //this file has reached the end
734                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
735                                 else {          
736                                         for (int j = 0; j < list->getNumBins(); j++) {
737                                                 outList << list->get(j) << '\t';
738                                                 rabund->push_back(getNumNames(list->get(j)));
739                                         }
740                                         delete list;
741                                 }
742                                 delete input;
743                         }
744                         
745                         SAbundVector sabund = rabund->getSAbundVector();
746                         
747                         sabund.print(outSabund);
748                         rabund->print(outRabund);
749                         outList << endl;
750                         
751                         delete rabund;
752                 }
753                 
754                 outList.close();
755                 outRabund.close();
756                 outSabund.close();
757                 
758                 if (listSingle != NULL) { delete listSingle;  }
759                 
760                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
761                 
762                 return 0;
763         }
764         catch(exception& e) {
765                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
766                 exit(1);
767         }
768 }
769
770 //**********************************************************************************************************************
771
772 void ClusterSplitCommand::printData(ListVector* oldList){
773         try {
774                 string label = oldList->getLabel();
775                 RAbundVector oldRAbund = oldList->getRAbundVector();
776                 
777                 oldRAbund.setLabel(label);
778                 if (isTrue(showabund)) {
779                         oldRAbund.getSAbundVector().print(cout);
780                 }
781                 oldRAbund.print(outRabund);
782                 oldRAbund.getSAbundVector().print(outSabund);
783         
784                 oldList->print(outList);
785         }
786         catch(exception& e) {
787                 m->errorOut(e, "ClusterSplitCommand", "printData");
788                 exit(1);
789         }
790 }
791 //**********************************************************************************************************************
792 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
793         try {
794         
795         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
796                 int process = 0;
797                 int exitCommand = 1;
798                 processIDS.clear();
799                 
800                 //loop through and create all the processes you want
801                 while (process != processors) {
802                         int pid = fork();
803                         
804                         if (pid > 0) {
805                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
806                                 process++;
807                         }else if (pid == 0){
808                                 set<string> labels;
809                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
810                                 
811                                 //write out names to file
812                                 string filename = toString(getpid()) + ".temp";
813                                 ofstream out;
814                                 openOutputFile(filename, out);
815                                 out << tag << endl;
816                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
817                                 out.close();
818                                 
819                                 //print out labels
820                                 ofstream outLabels;
821                                 filename = toString(getpid()) + ".temp.labels";
822                                 openOutputFile(filename, outLabels);
823                                 
824                                 outLabels << cutoff << endl;
825                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
826                                         outLabels << (*it) << endl;
827                                 }
828                                 outLabels.close();
829
830                                 exit(0);
831                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
832                 }
833                 
834                 //force parent to wait until all the processes are done
835                 for (int i=0;i<processors;i++) { 
836                         int temp = processIDS[i];
837                         wait(&temp);
838                 }
839                 
840                 return exitCommand;
841         #endif          
842         
843         }
844         catch(exception& e) {
845                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
846                 exit(1);
847         }
848 }
849 //**********************************************************************************************************************
850
851 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
852         try {
853                 Cluster* cluster;
854                 SparseMatrix* matrix;
855                 ListVector* list;
856                 ListVector oldList;
857                 RAbundVector* rabund;
858                 
859                 vector<string> listFileNames;
860                 
861                 double smallestCutoff = cutoff;
862                 
863                 //cluster each distance file
864                 for (int i = 0; i < distNames.size(); i++) {
865                         if (m->control_pressed) { return listFileNames; }
866                         
867                         string thisNamefile = distNames[i].begin()->second;
868                         string thisDistFile = distNames[i].begin()->first;
869                         
870                         //read in distance file
871                         globaldata->setNameFile(thisNamefile);
872                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
873                         
874                         #ifdef USE_MPI
875                                 int pid;
876                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
877                                 
878                                 //output your files too
879                                 if (pid != 0) {
880                                         cout << endl << "Reading " << thisDistFile << endl;
881                                 }
882                         #endif
883                         
884                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
885                         
886                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
887                         read->setCutoff(cutoff);
888
889                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
890                         nameMap->readMap();
891                         read->read(nameMap);
892                         
893                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
894                         
895                         list = read->getListVector();
896                         oldList = *list;
897                         matrix = read->getMatrix();
898                         
899                         delete read; 
900                         delete nameMap; 
901                         
902                         
903                         #ifdef USE_MPI
904                                 //output your files too
905                                 if (pid != 0) {
906                                         cout << endl << "Clustering " << thisDistFile << endl;
907                                 }
908                         #endif
909                         
910                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
911                 
912                         rabund = new RAbundVector(list->getRAbundVector());
913                         
914                         //create cluster
915                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
916                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
917                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
918                         tag = cluster->getTag();
919                 
920                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
921                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
922                         
923                         ofstream listFile;
924                         openOutputFile(fileroot+ tag + ".list", listFile);
925                 
926                         listFileNames.push_back(fileroot+ tag + ".list");
927                 
928                         time_t estart = time(NULL);
929                         
930                         float previousDist = 0.00000;
931                         float rndPreviousDist = 0.00000;
932                         
933                         oldList = *list;
934
935                         print_start = true;
936                         start = time(NULL);
937                         double saveCutoff = cutoff;
938                 
939                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
940                 
941                                 if (m->control_pressed) { //clean up
942                                         delete matrix; delete list;     delete cluster; delete rabund;
943                                         listFile.close();
944                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
945                                         listFileNames.clear(); return listFileNames;
946                                 }
947                 
948                                 cluster->update(saveCutoff);
949         
950                                 float dist = matrix->getSmallDist();
951                                 float rndDist;
952                                 if (hard) {
953                                         rndDist = ceilDist(dist, precision); 
954                                 }else{
955                                         rndDist = roundDist(dist, precision); 
956                                 }
957
958                                 if(previousDist <= 0.0000 && dist != previousDist){
959                                         oldList.setLabel("unique");
960                                         oldList.print(listFile);
961                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
962                                 }
963                                 else if(rndDist != rndPreviousDist){
964                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
965                                         oldList.print(listFile);
966                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
967                                 }
968                 
969                                 previousDist = dist;
970                                 rndPreviousDist = rndDist;
971                                 oldList = *list;
972                         }
973
974                 
975                         if(previousDist <= 0.0000){
976                                 oldList.setLabel("unique");
977                                 oldList.print(listFile);
978                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
979                         }
980                         else if(rndPreviousDist<cutoff){
981                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
982                                 oldList.print(listFile);
983                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
984                         }
985         
986                         delete matrix; delete list;     delete cluster; delete rabund; 
987                         listFile.close();
988                         
989                         if (m->control_pressed) { //clean up
990                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
991                                 listFileNames.clear(); return listFileNames;
992                         }
993                         
994                         remove(thisDistFile.c_str());
995                         remove(thisNamefile.c_str());
996                         
997                         if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  }
998                         
999                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1000                 }
1001                 
1002                 cutoff = smallestCutoff;
1003                                         
1004                 return listFileNames;
1005         
1006         }
1007         catch(exception& e) {
1008                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1009                 exit(1);
1010         }
1011
1012
1013 }
1014
1015 //**********************************************************************************************************************