]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
added pipeline commands which involved change to command factory and command class...
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){       
21         try {
22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
24                 return myArray;
25         }
26         catch(exception& e) {
27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
28                 exit(1);
29         }
30 }
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){     
33         try {
34                 //initialize outputTypes
35                 vector<string> tempOutNames;
36                 outputTypes["list"] = tempOutNames;
37                 outputTypes["rabund"] = tempOutNames;
38                 outputTypes["sabund"] = tempOutNames;
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
42                 exit(1);
43         }
44 }
45 //**********************************************************************************************************************
46 vector<string> ClusterSplitCommand::getRequiredParameters(){    
47         try {
48                 string Array[] =  {"fasta","phylip","column","or"};
49                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
50                 return myArray;
51         }
52         catch(exception& e) {
53                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
54                 exit(1);
55         }
56 }
57 //**********************************************************************************************************************
58 vector<string> ClusterSplitCommand::getRequiredFiles(){ 
59         try {
60                 vector<string> myArray;
61                 return myArray;
62         }
63         catch(exception& e) {
64                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
65                 exit(1);
66         }
67 }
68 //**********************************************************************************************************************
69 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
70 ClusterSplitCommand::ClusterSplitCommand(string option)  {
71         try{
72                 globaldata = GlobalData::getInstance();
73                 abort = false;
74                 format = "";
75                 
76                 //allow user to run help
77                 if(option == "help") { help(); abort = true; }
78                 
79                 else {
80                         //valid paramters for this command
81                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
82                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
83                         
84                         OptionParser parser(option);
85                         map<string,string> parameters = parser.getParameters();
86                         
87                         ValidParameters validParameter("cluster.split");
88                 
89                         //check to make sure all parameters are valid for command
90                         map<string,string>::iterator it;
91                         for (it = parameters.begin(); it != parameters.end(); it++) { 
92                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
93                                         abort = true;
94                                 }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99                         outputTypes["list"] = tempOutNames;
100                         outputTypes["rabund"] = tempOutNames;
101                         outputTypes["sabund"] = tempOutNames;
102                         
103                         globaldata->newRead();
104                         
105                         //if the user changes the output directory command factory will send this info to us in the output parameter 
106                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
107                         
108                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
110                         if (inputDir == "not found"){   inputDir = "";          }
111                         else {
112                                 string path;
113                                 it = parameters.find("phylip");
114                                 //user has given a template file
115                                 if(it != parameters.end()){ 
116                                         path = m->hasPath(it->second);
117                                         //if the user has not given a path then, add inputdir. else leave path alone.
118                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
119                                 }
120                                 
121                                 it = parameters.find("column");
122                                 //user has given a template file
123                                 if(it != parameters.end()){ 
124                                         path = m->hasPath(it->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
127                                 }
128                                 
129                                 it = parameters.find("name");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
135                                 }
136                                 
137                                 it = parameters.find("taxonomy");
138                                 //user has given a template file
139                                 if(it != parameters.end()){ 
140                                         path = m->hasPath(it->second);
141                                         //if the user has not given a path then, add inputdir. else leave path alone.
142                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
143                                 }
144                                 
145                                 it = parameters.find("fasta");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
151                                 }
152                         }
153                         
154                         //check for required parameters
155                         phylipfile = validParameter.validFile(parameters, "phylip", true);
156                         if (phylipfile == "not open") { abort = true; }
157                         else if (phylipfile == "not found") { phylipfile = ""; }        
158                         else {  distfile = phylipfile;  format = "phylip";      }
159                         
160                         columnfile = validParameter.validFile(parameters, "column", true);
161                         if (columnfile == "not open") { abort = true; } 
162                         else if (columnfile == "not found") { columnfile = ""; }
163                         else {  distfile = columnfile; format = "column";       }
164                         
165                         namefile = validParameter.validFile(parameters, "name", true);
166                         if (namefile == "not open") { abort = true; }   
167                         else if (namefile == "not found") { namefile = ""; }
168                         
169                         fastafile = validParameter.validFile(parameters, "fasta", true);
170                         if (fastafile == "not open") { abort = true; }  
171                         else if (fastafile == "not found") { fastafile = ""; }
172                         else { distfile = fastafile;  splitmethod = "fasta";  }
173                         
174                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
175                         if (taxFile == "not open") { abort = true; }    
176                         else if (taxFile == "not found") { taxFile = ""; }
177                         
178                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
179                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
180                 
181                         if (columnfile != "") {
182                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
183                         }
184                         
185                         if (fastafile != "") {
186                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
187                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
188                         }
189                                         
190                         //check for optional parameter and set defaults
191                         // ...at some point should added some additional type checking...
192                         //get user cutoff and precision or use defaults
193                         string temp;
194                         temp = validParameter.validFile(parameters, "precision", false);
195                         if (temp == "not found") { temp = "100"; }
196                         //saves precision legnth for formatting below
197                         length = temp.length();
198                         convert(temp, precision); 
199                         
200                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
201                         hard = m->isTrue(temp);
202                         
203                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
204                         large = m->isTrue(temp);
205                         
206                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
207                         convert(temp, processors); 
208                         
209                         temp = validParameter.validFile(parameters, "splitmethod", false);      
210                         if (splitmethod != "fasta") {
211                                 if (temp == "not found")  { splitmethod = "distance"; }
212                                 else {  splitmethod = temp; }
213                         }
214                         
215                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
216                         convert(temp, cutoff); 
217                         cutoff += (5 / (precision * 10.0));  
218                         
219                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
220                         convert(temp, taxLevelCutoff); 
221                         
222                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
223                         
224                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
225                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
226                         
227                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
228                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
229                         
230                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
231
232                         showabund = validParameter.validFile(parameters, "showabund", false);
233                         if (showabund == "not found") { showabund = "T"; }
234
235                         timing = validParameter.validFile(parameters, "timing", false);
236                         if (timing == "not found") { timing = "F"; }
237                         
238                 }
239         }
240         catch(exception& e) {
241                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
242                 exit(1);
243         }
244 }
245
246 //**********************************************************************************************************************
247
248 void ClusterSplitCommand::help(){
249         try {
250                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
251                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
252                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
253                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
254                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
255                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
256                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
257                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
258                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
259                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
260                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
261                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
262                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
263                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
264                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
265                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
266                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
267                 #ifdef USE_MPI
268                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
269                 #endif
270                 m->mothurOut("The cluster.split command should be in the following format: \n");
271                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
272                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
273
274         }
275         catch(exception& e) {
276                 m->errorOut(e, "ClusterSplitCommand", "help");
277                 exit(1);
278         }
279 }
280
281 //**********************************************************************************************************************
282
283 ClusterSplitCommand::~ClusterSplitCommand(){}
284
285 //**********************************************************************************************************************
286
287 int ClusterSplitCommand::execute(){
288         try {
289         
290                 if (abort == true) {    return 0;       }
291                 
292                 time_t estart;
293                 vector<string> listFileNames;
294                 set<string> labels;
295                 string singletonName = "";
296                 double saveCutoff = cutoff;
297
298                 //****************** file prep work ******************************//
299                 #ifdef USE_MPI
300                         int pid;
301                         int tag = 2001;
302                         MPI_Status status; 
303                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
304                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
305                         
306                         if (pid == 0) { //only process 0 converts and splits
307                         
308                 #endif
309                 
310                 //if user gave a phylip file convert to column file
311                 if (format == "phylip") {
312                         estart = time(NULL);
313                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
314                         
315                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
316                         
317                         NameAssignment* nameMap = NULL;
318                         convert->setFormat("phylip");
319                         convert->read(nameMap);
320                         
321                         if (m->control_pressed) {  delete convert;  return 0;  }
322                         
323                         distfile = convert->getOutputFile();
324                 
325                         //if no names file given with phylip file, create it
326                         ListVector* listToMakeNameFile =  convert->getListVector();
327                         if (namefile == "") {  //you need to make a namefile for split matrix
328                                 ofstream out;
329                                 namefile = phylipfile + ".names";
330                                 m->openOutputFile(namefile, out);
331                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
332                                         string bin = listToMakeNameFile->get(i);
333                                         out << bin << '\t' << bin << endl;
334                                 }
335                                 out.close();
336                         }
337                         delete listToMakeNameFile;
338                         delete convert;
339                         
340                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
341                 }
342                 if (m->control_pressed) { return 0; }
343                 
344                 estart = time(NULL);
345                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
346                 
347                 //split matrix into non-overlapping groups
348                 SplitMatrix* split;
349                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
350                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
351                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
352                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
353                 
354                 split->split();
355                 
356                 if (m->control_pressed) { delete split; return 0; }
357                 
358                 singletonName = split->getSingletonNames();
359                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
360                 delete split;
361                 
362                 if (m->control_pressed) { return 0; }
363                 
364                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
365                 estart = time(NULL);
366                 
367                 //****************** break up files between processes and cluster each file set ******************************//
368         #ifdef USE_MPI
369                         ////you are process 0 from above////
370                         
371                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
372                         dividedNames.resize(processors);
373                                         
374                         //for each file group figure out which process will complete it
375                         //want to divide the load intelligently so the big files are spread between processes
376                         int count = 1;
377                         for (int i = 0; i < distName.size(); i++) { 
378                                 int processToAssign = (i+1) % processors; 
379                                 if (processToAssign == 0) { processToAssign = processors; }
380                                                 
381                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
382                         }
383                                         
384                         //not lets reverse the order of ever other process, so we balance big files running with little ones
385                         for (int i = 0; i < processors; i++) {
386                                 int remainder = ((i+1) % processors);
387                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
388                         }
389                         
390                         
391                         //send each child the list of files it needs to process
392                         for(int i = 1; i < processors; i++) { 
393                                 //send number of file pairs
394                                 int num = dividedNames[i].size();
395                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
396                                 
397                                 for (int j = 0; j < num; j++) { //send filenames to process i
398                                         char tempDistFileName[1024];
399                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
400                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
401                                         
402                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
404                                         
405                                         char tempNameFileName[1024];
406                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
407                                         int lengthName = (dividedNames[i][j].begin()->second).length();
408
409                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
410                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
411                                 }
412                         }
413                         
414                         //process your share
415                         listFileNames = cluster(dividedNames[0], labels);
416                         
417                         //receive the other processes info
418                         for(int i = 1; i < processors; i++) { 
419                                 int num = dividedNames[i].size();
420                                 
421                                 double tempCutoff;
422                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
423                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
424                                 
425                                 //send list filenames to root process
426                                 for (int j = 0; j < num; j++) {  
427                                         int lengthList = 0;
428                                         char tempListFileName[1024];
429                                 
430                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
431                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
432                                 
433                                         string myListFileName = tempListFileName;
434                                         myListFileName = myListFileName.substr(0, lengthList);
435                                         
436                                         listFileNames.push_back(myListFileName);
437                                 }
438                                 
439                                 //send Labels to root process
440                                 int numLabels = 0;
441                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
442                                 
443                                 for (int j = 0; j < numLabels; j++) {  
444                                         int lengthLabel = 0;
445                                         char tempLabel[100];
446                                 
447                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
448                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
449                                 
450                                         string myLabel = tempLabel;
451                                         myLabel = myLabel.substr(0, lengthLabel);
452                         
453                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
454                                 }
455                         }
456                         
457                 }else { //you are a child process
458                         vector < map<string, string> >  myNames;
459                         
460                         //recieve the files you need to process
461                         //receive number of file pairs
462                         int num = 0;
463                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
464                         
465                         myNames.resize(num);
466         
467                         for (int j = 0; j < num; j++) { //receive filenames to process 
468                                 int lengthDist = 0;
469                                 char tempDistFileName[1024];
470                                 
471                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
472                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
473                                 
474                                 string myDistFileName = tempDistFileName;
475                                 myDistFileName = myDistFileName.substr(0, lengthDist);
476                         
477                                 int lengthName = 0;
478                                 char tempNameFileName[1024];
479                                 
480                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
481                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
482                                 
483                                 string myNameFileName = tempNameFileName;
484                                 myNameFileName = myNameFileName.substr(0, lengthName);
485                                 
486                                 //save file name
487                                 myNames[j][myDistFileName] = myNameFileName;
488                         }
489         
490                         //process them
491                         listFileNames = cluster(myNames, labels);
492                         
493                         //send cutoff
494                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
495                         
496                         //send list filenames to root process
497                         for (int j = 0; j < num; j++) {  
498                                 char tempListFileName[1024];
499                                 strcpy(tempListFileName, listFileNames[j].c_str());
500                                 int lengthList = listFileNames[j].length();
501                                         
502                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
503                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
504                         }
505                         
506                         //send Labels to root process
507                         int numLabels = labels.size();
508                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
509                         
510                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
511                                 char tempLabel[100];
512                                 strcpy(tempLabel, (*it).c_str());
513                                 int lengthLabel = (*it).length();
514                                         
515                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
516                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
517                         }
518                 }
519                 
520                 //make everyone wait
521                 MPI_Barrier(MPI_COMM_WORLD);
522                 
523         #else
524
525                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
526                                 if(processors == 1){
527                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
528                                 }else{
529                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
530                                         dividedNames.resize(processors);
531                                         
532                                         //for each file group figure out which process will complete it
533                                         //want to divide the load intelligently so the big files are spread between processes
534                                         int count = 1;
535                                         for (int i = 0; i < distName.size(); i++) { 
536                                                 int processToAssign = (i+1) % processors; 
537                                                 if (processToAssign == 0) { processToAssign = processors; }
538                                                 
539                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
540                                         }
541                                         
542                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
543                                         for (int i = 0; i < processors; i++) {
544                                                 int remainder = ((i+1) % processors);
545                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
546                                         }
547                                         
548                                         createProcesses(dividedNames);
549                                                         
550                                         if (m->control_pressed) { return 0; }
551
552                                         //get list of list file names from each process
553                                         for(int i=0;i<processors;i++){
554                                                 string filename = toString(processIDS[i]) + ".temp";
555                                                 ifstream in;
556                                                 m->openInputFile(filename, in);
557                                                 
558                                                 in >> tag; m->gobble(in);
559                                                 
560                                                 while(!in.eof()) {
561                                                         string tempName;
562                                                         in >> tempName; m->gobble(in);
563                                                         listFileNames.push_back(tempName);
564                                                 }
565                                                 in.close();
566                                                 remove((toString(processIDS[i]) + ".temp").c_str());
567                                                 
568                                                 //get labels
569                                                 filename = toString(processIDS[i]) + ".temp.labels";
570                                                 ifstream in2;
571                                                 m->openInputFile(filename, in2);
572                                                 
573                                                 float tempCutoff;
574                                                 in2 >> tempCutoff; m->gobble(in2);
575                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
576                                                 
577                                                 while(!in2.eof()) {
578                                                         string tempName;
579                                                         in2 >> tempName; m->gobble(in2);
580                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
581                                                 }
582                                                 in2.close();
583                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
584                                         }
585                                 }
586                 #else
587                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
588                 #endif
589         #endif  
590                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
591                 
592                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
593                 
594                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
595                 
596                 //****************** merge list file and create rabund and sabund files ******************************//
597                 estart = time(NULL);
598                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
599                 
600                 #ifdef USE_MPI
601                         if (pid == 0) { //only process 0 merges
602                 #endif
603
604                 ListVector* listSingle;
605                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
606                 
607                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
608                 
609                 mergeLists(listFileNames, labelBins, listSingle);
610
611                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
612                 
613                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
614                 
615                 m->mothurOutEndLine();
616                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
617                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
618                 m->mothurOutEndLine();
619                 
620                 #ifdef USE_MPI
621                         } //only process 0 merges
622                         
623                         //make everyone wait
624                         MPI_Barrier(MPI_COMM_WORLD);
625                 #endif
626
627                 return 0;
628         }
629         catch(exception& e) {
630                 m->errorOut(e, "ClusterSplitCommand", "execute");
631                 exit(1);
632         }
633 }
634 //**********************************************************************************************************************
635 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
636         try {
637                                 
638                 map<float, int> labelBin;
639                 vector<float> orderFloat;
640                 int numSingleBins;
641                 
642                 //read in singletons
643                 if (singleton != "none") {
644                         ifstream in;
645                         m->openInputFile(singleton, in);
646                                 
647                         string firstCol, secondCol;
648                         listSingle = new ListVector();
649                         while (!in.eof()) {
650                                 in >> firstCol >> secondCol; m->gobble(in);
651                                 listSingle->push_back(secondCol);
652                         }
653                         in.close();
654                         remove(singleton.c_str());
655                         
656                         numSingleBins = listSingle->getNumBins();
657                 }else{  listSingle = NULL; numSingleBins = 0;  }
658                 
659                 //go through users set and make them floats so we can sort them 
660                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
661                         float temp = -10.0;
662
663                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
664                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
665                         
666                         if (temp <= cutoff) {
667                                 orderFloat.push_back(temp);
668                                 labelBin[temp] = numSingleBins; //initialize numbins 
669                         }
670                 }
671         
672                 //sort order
673                 sort(orderFloat.begin(), orderFloat.end());
674                 userLabels.clear();
675                         
676                 //get the list info from each file
677                 for (int k = 0; k < listNames.size(); k++) {
678         
679                         if (m->control_pressed) {  
680                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
681                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
682                                 return labelBin;
683                         }
684                         
685                         InputData* input = new InputData(listNames[k], "list");
686                         ListVector* list = input->getListVector();
687                         string lastLabel = list->getLabel();
688                         
689                         string filledInList = listNames[k] + "filledInTemp";
690                         ofstream outFilled;
691                         m->openOutputFile(filledInList, outFilled);
692         
693                         //for each label needed
694                         for(int l = 0; l < orderFloat.size(); l++){
695                         
696                                 string thisLabel;
697                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
698                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
699
700                                 //this file has reached the end
701                                 if (list == NULL) { 
702                                         list = input->getListVector(lastLabel, true); 
703                                 }else{  //do you have the distance, or do you need to fill in
704                                                 
705                                         float labelFloat;
706                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
707                                         else { convert(list->getLabel(), labelFloat); }
708
709                                         //check for missing labels
710                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
711                                                 //if its bigger get last label, otherwise keep it
712                                                 delete list;
713                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
714                                         }
715                                         lastLabel = list->getLabel();
716                                 }
717                                 
718                                 //print to new file
719                                 list->setLabel(thisLabel);
720                                 list->print(outFilled);
721                 
722                                 //update labelBin
723                                 labelBin[orderFloat[l]] += list->getNumBins();
724                                                                         
725                                 delete list;
726                                                                         
727                                 list = input->getListVector();
728                         }
729                         
730                         if (list != NULL) { delete list; }
731                         delete input;
732                         
733                         outFilled.close();
734                         remove(listNames[k].c_str());
735                         rename(filledInList.c_str(), listNames[k].c_str());
736                 }
737                 
738                 return labelBin;
739         }
740         catch(exception& e) {
741                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
742                 exit(1);
743         }
744 }
745 //**********************************************************************************************************************
746 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
747         try {
748                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
749                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
750                 
751                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
752                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
753                 m->openOutputFile(fileroot+ tag + ".list",              outList);
754                                 
755                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
756                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
757                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
758                 
759                 map<float, int>::iterator itLabel;
760
761                 //for each label needed
762                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
763                         
764                         string thisLabel;
765                         if (itLabel->first == -1) { thisLabel = "unique"; }
766                         else { thisLabel = toString(itLabel->first,  length-1);  } 
767                         
768                         outList << thisLabel << '\t' << itLabel->second << '\t';
769
770                         RAbundVector* rabund = new RAbundVector();
771                         rabund->setLabel(thisLabel);
772
773                         //add in singletons
774                         if (listSingle != NULL) {
775                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
776                                         outList << listSingle->get(j) << '\t';
777                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
778                                 }
779                         }
780                         
781                         //get the list info from each file
782                         for (int k = 0; k < listNames.size(); k++) {
783         
784                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
785                                 
786                                 InputData* input = new InputData(listNames[k], "list");
787                                 ListVector* list = input->getListVector(thisLabel);
788                                 
789                                 //this file has reached the end
790                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
791                                 else {          
792                                         for (int j = 0; j < list->getNumBins(); j++) {
793                                                 outList << list->get(j) << '\t';
794                                                 rabund->push_back(m->getNumNames(list->get(j)));
795                                         }
796                                         delete list;
797                                 }
798                                 delete input;
799                         }
800                         
801                         SAbundVector sabund = rabund->getSAbundVector();
802                         
803                         sabund.print(outSabund);
804                         rabund->print(outRabund);
805                         outList << endl;
806                         
807                         delete rabund;
808                 }
809                 
810                 outList.close();
811                 outRabund.close();
812                 outSabund.close();
813                 
814                 if (listSingle != NULL) { delete listSingle;  }
815                 
816                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
817                 
818                 return 0;
819         }
820         catch(exception& e) {
821                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
822                 exit(1);
823         }
824 }
825
826 //**********************************************************************************************************************
827
828 void ClusterSplitCommand::printData(ListVector* oldList){
829         try {
830                 string label = oldList->getLabel();
831                 RAbundVector oldRAbund = oldList->getRAbundVector();
832                 
833                 oldRAbund.setLabel(label);
834                 if (m->isTrue(showabund)) {
835                         oldRAbund.getSAbundVector().print(cout);
836                 }
837                 oldRAbund.print(outRabund);
838                 oldRAbund.getSAbundVector().print(outSabund);
839         
840                 oldList->print(outList);
841         }
842         catch(exception& e) {
843                 m->errorOut(e, "ClusterSplitCommand", "printData");
844                 exit(1);
845         }
846 }
847 //**********************************************************************************************************************
848 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
849         try {
850         
851         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
852                 int process = 0;
853                 int exitCommand = 1;
854                 processIDS.clear();
855                 
856                 //loop through and create all the processes you want
857                 while (process != processors) {
858                         int pid = fork();
859                         
860                         if (pid > 0) {
861                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
862                                 process++;
863                         }else if (pid == 0){
864                                 set<string> labels;
865                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
866                                 
867                                 //write out names to file
868                                 string filename = toString(getpid()) + ".temp";
869                                 ofstream out;
870                                 m->openOutputFile(filename, out);
871                                 out << tag << endl;
872                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
873                                 out.close();
874                                 
875                                 //print out labels
876                                 ofstream outLabels;
877                                 filename = toString(getpid()) + ".temp.labels";
878                                 m->openOutputFile(filename, outLabels);
879                                 
880                                 outLabels << cutoff << endl;
881                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
882                                         outLabels << (*it) << endl;
883                                 }
884                                 outLabels.close();
885
886                                 exit(0);
887                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
888                 }
889                 
890                 //force parent to wait until all the processes are done
891                 for (int i=0;i<processors;i++) { 
892                         int temp = processIDS[i];
893                         wait(&temp);
894                 }
895                 
896                 return exitCommand;
897         #endif          
898         
899         }
900         catch(exception& e) {
901                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
902                 exit(1);
903         }
904 }
905 //**********************************************************************************************************************
906
907 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
908         try {
909                 Cluster* cluster;
910                 SparseMatrix* matrix;
911                 ListVector* list;
912                 ListVector oldList;
913                 RAbundVector* rabund;
914                 
915                 vector<string> listFileNames;
916                 
917                 double smallestCutoff = cutoff;
918                 
919                 //cluster each distance file
920                 for (int i = 0; i < distNames.size(); i++) {
921                         if (m->control_pressed) { return listFileNames; }
922                         
923                         string thisNamefile = distNames[i].begin()->second;
924                         string thisDistFile = distNames[i].begin()->first;
925                         
926                         //read in distance file
927                         globaldata->setNameFile(thisNamefile);
928                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
929                         
930                         #ifdef USE_MPI
931                                 int pid;
932                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
933                                 
934                                 //output your files too
935                                 if (pid != 0) {
936                                         cout << endl << "Reading " << thisDistFile << endl;
937                                 }
938                         #endif
939                         
940                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
941                         
942                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
943                         read->setCutoff(cutoff);
944
945                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
946                         nameMap->readMap();
947                         read->read(nameMap);
948                         
949                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
950                         
951                         list = read->getListVector();
952                         oldList = *list;
953                         matrix = read->getMatrix();
954                         
955                         delete read; 
956                         delete nameMap; 
957                         
958                         
959                         #ifdef USE_MPI
960                                 //output your files too
961                                 if (pid != 0) {
962                                         cout << endl << "Clustering " << thisDistFile << endl;
963                                 }
964                         #endif
965                         
966                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
967                 
968                         rabund = new RAbundVector(list->getRAbundVector());
969                         
970                         //create cluster
971                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
972                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
973                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
974                         tag = cluster->getTag();
975                 
976                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
977                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
978                         
979                         ofstream listFile;
980                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
981                 
982                         listFileNames.push_back(fileroot+ tag + ".list");
983                 
984                         time_t estart = time(NULL);
985                         
986                         float previousDist = 0.00000;
987                         float rndPreviousDist = 0.00000;
988                         
989                         oldList = *list;
990
991                         print_start = true;
992                         start = time(NULL);
993                         double saveCutoff = cutoff;
994                 
995                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
996                 
997                                 if (m->control_pressed) { //clean up
998                                         delete matrix; delete list;     delete cluster; delete rabund;
999                                         listFile.close();
1000                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1001                                         listFileNames.clear(); return listFileNames;
1002                                 }
1003                 
1004                                 cluster->update(saveCutoff);
1005         
1006                                 float dist = matrix->getSmallDist();
1007                                 float rndDist;
1008                                 if (hard) {
1009                                         rndDist = m->ceilDist(dist, precision); 
1010                                 }else{
1011                                         rndDist = m->roundDist(dist, precision); 
1012                                 }
1013
1014                                 if(previousDist <= 0.0000 && dist != previousDist){
1015                                         oldList.setLabel("unique");
1016                                         oldList.print(listFile);
1017                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1018                                 }
1019                                 else if(rndDist != rndPreviousDist){
1020                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1021                                         oldList.print(listFile);
1022                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1023                                 }
1024                 
1025                                 previousDist = dist;
1026                                 rndPreviousDist = rndDist;
1027                                 oldList = *list;
1028                         }
1029
1030                 
1031                         if(previousDist <= 0.0000){
1032                                 oldList.setLabel("unique");
1033                                 oldList.print(listFile);
1034                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1035                         }
1036                         else if(rndPreviousDist<cutoff){
1037                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1038                                 oldList.print(listFile);
1039                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1040                         }
1041         
1042                         delete matrix; delete list;     delete cluster; delete rabund; 
1043                         listFile.close();
1044                         
1045                         if (m->control_pressed) { //clean up
1046                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1047                                 listFileNames.clear(); return listFileNames;
1048                         }
1049                         
1050                         remove(thisDistFile.c_str());
1051                         remove(thisNamefile.c_str());
1052                         
1053                         if (saveCutoff != cutoff) { 
1054                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1055                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1056                         
1057                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1058                         }
1059                         
1060                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1061                 }
1062                 
1063                 cutoff = smallestCutoff;
1064                                         
1065                 return listFileNames;
1066         
1067         }
1068         catch(exception& e) {
1069                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1070                 exit(1);
1071         }
1072
1073
1074 }
1075
1076 //**********************************************************************************************************************