]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
added consensus.seqs command
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){       
21         try {
22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
24                 return myArray;
25         }
26         catch(exception& e) {
27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
28                 exit(1);
29         }
30 }
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){     
33         try {
34                 abort = true;
35                 //initialize outputTypes
36                 vector<string> tempOutNames;
37                 outputTypes["list"] = tempOutNames;
38                 outputTypes["rabund"] = tempOutNames;
39                 outputTypes["sabund"] = tempOutNames;
40                 outputTypes["column"] = tempOutNames;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
44                 exit(1);
45         }
46 }
47 //**********************************************************************************************************************
48 vector<string> ClusterSplitCommand::getRequiredParameters(){    
49         try {
50                 string Array[] =  {"fasta","phylip","column","or"};
51                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 vector<string> ClusterSplitCommand::getRequiredFiles(){ 
61         try {
62                 vector<string> myArray;
63                 return myArray;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
72 ClusterSplitCommand::ClusterSplitCommand(string option)  {
73         try{
74                 globaldata = GlobalData::getInstance();
75                 abort = false;
76                 format = "";
77                 
78                 //allow user to run help
79                 if(option == "help") { help(); abort = true; }
80                 
81                 else {
82                         //valid paramters for this command
83                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
84                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter("cluster.split");
90                 
91                         //check to make sure all parameters are valid for command
92                         map<string,string>::iterator it;
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
95                                         abort = true;
96                                 }
97                         }
98                         
99                         //initialize outputTypes
100                         vector<string> tempOutNames;
101                         outputTypes["list"] = tempOutNames;
102                         outputTypes["rabund"] = tempOutNames;
103                         outputTypes["sabund"] = tempOutNames;
104                         outputTypes["column"] = tempOutNames;
105                         
106                         globaldata->newRead();
107                         
108                         //if the user changes the output directory command factory will send this info to us in the output parameter 
109                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
110                         
111                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
112                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
113                         if (inputDir == "not found"){   inputDir = "";          }
114                         else {
115                                 string path;
116                                 it = parameters.find("phylip");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
122                                 }
123                                 
124                                 it = parameters.find("column");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
130                                 }
131                                 
132                                 it = parameters.find("name");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
138                                 }
139                                 
140                                 it = parameters.find("taxonomy");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
146                                 }
147                                 
148                                 it = parameters.find("fasta");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
154                                 }
155                         }
156                         
157                         //check for required parameters
158                         phylipfile = validParameter.validFile(parameters, "phylip", true);
159                         if (phylipfile == "not open") { abort = true; }
160                         else if (phylipfile == "not found") { phylipfile = ""; }        
161                         else {  distfile = phylipfile;  format = "phylip";      }
162                         
163                         columnfile = validParameter.validFile(parameters, "column", true);
164                         if (columnfile == "not open") { abort = true; } 
165                         else if (columnfile == "not found") { columnfile = ""; }
166                         else {  distfile = columnfile; format = "column";       }
167                         
168                         namefile = validParameter.validFile(parameters, "name", true);
169                         if (namefile == "not open") { abort = true; }   
170                         else if (namefile == "not found") { namefile = ""; }
171                         
172                         fastafile = validParameter.validFile(parameters, "fasta", true);
173                         if (fastafile == "not open") { abort = true; }  
174                         else if (fastafile == "not found") { fastafile = ""; }
175                         else { distfile = fastafile;  splitmethod = "fasta";  }
176                         
177                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
178                         if (taxFile == "not open") { abort = true; }    
179                         else if (taxFile == "not found") { taxFile = ""; }
180                         
181                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
182                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
183                 
184                         if (columnfile != "") {
185                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
186                         }
187                         
188                         if (fastafile != "") {
189                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
191                         }
192                                         
193                         //check for optional parameter and set defaults
194                         // ...at some point should added some additional type checking...
195                         //get user cutoff and precision or use defaults
196                         string temp;
197                         temp = validParameter.validFile(parameters, "precision", false);
198                         if (temp == "not found") { temp = "100"; }
199                         //saves precision legnth for formatting below
200                         length = temp.length();
201                         convert(temp, precision); 
202                         
203                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
204                         hard = m->isTrue(temp);
205                         
206                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
207                         large = m->isTrue(temp);
208                         
209                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
210                         convert(temp, processors); 
211                         
212                         temp = validParameter.validFile(parameters, "splitmethod", false);      
213                         if (splitmethod != "fasta") {
214                                 if (temp == "not found")  { splitmethod = "distance"; }
215                                 else {  splitmethod = temp; }
216                         }
217                         
218                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
219                         convert(temp, cutoff); 
220                         cutoff += (5 / (precision * 10.0));  
221                         
222                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
223                         convert(temp, taxLevelCutoff); 
224                         
225                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
226                         
227                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
228                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
229                         
230                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
231                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
232                         
233                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
234
235                         showabund = validParameter.validFile(parameters, "showabund", false);
236                         if (showabund == "not found") { showabund = "T"; }
237
238                         timing = validParameter.validFile(parameters, "timing", false);
239                         if (timing == "not found") { timing = "F"; }
240                         
241                 }
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
245                 exit(1);
246         }
247 }
248
249 //**********************************************************************************************************************
250
251 void ClusterSplitCommand::help(){
252         try {
253                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
254                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
255                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
256                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
257                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
258                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
259                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
260                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
261                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
262                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
263                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
264                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
265                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
266                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
267                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
268                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
269                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
270                 #ifdef USE_MPI
271                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
272                 #endif
273                 m->mothurOut("The cluster.split command should be in the following format: \n");
274                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
275                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
276
277         }
278         catch(exception& e) {
279                 m->errorOut(e, "ClusterSplitCommand", "help");
280                 exit(1);
281         }
282 }
283
284 //**********************************************************************************************************************
285
286 ClusterSplitCommand::~ClusterSplitCommand(){}
287
288 //**********************************************************************************************************************
289
290 int ClusterSplitCommand::execute(){
291         try {
292         
293                 if (abort == true) {    return 0;       }
294                 
295                 time_t estart;
296                 vector<string> listFileNames;
297                 set<string> labels;
298                 string singletonName = "";
299                 double saveCutoff = cutoff;
300
301                 //****************** file prep work ******************************//
302                 #ifdef USE_MPI
303                         int pid;
304                         int tag = 2001;
305                         MPI_Status status; 
306                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
307                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
308                         
309                         if (pid == 0) { //only process 0 converts and splits
310                         
311                 #endif
312                 
313                 //if user gave a phylip file convert to column file
314                 if (format == "phylip") {
315                         estart = time(NULL);
316                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
317                         
318                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
319                         
320                         NameAssignment* nameMap = NULL;
321                         convert->setFormat("phylip");
322                         convert->read(nameMap);
323                         
324                         if (m->control_pressed) {  delete convert;  return 0;  }
325                         
326                         distfile = convert->getOutputFile();
327                 
328                         //if no names file given with phylip file, create it
329                         ListVector* listToMakeNameFile =  convert->getListVector();
330                         if (namefile == "") {  //you need to make a namefile for split matrix
331                                 ofstream out;
332                                 namefile = phylipfile + ".names";
333                                 m->openOutputFile(namefile, out);
334                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
335                                         string bin = listToMakeNameFile->get(i);
336                                         out << bin << '\t' << bin << endl;
337                                 }
338                                 out.close();
339                         }
340                         delete listToMakeNameFile;
341                         delete convert;
342                         
343                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
344                 }
345                 if (m->control_pressed) { return 0; }
346                 
347                 estart = time(NULL);
348                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
349                 
350                 //split matrix into non-overlapping groups
351                 SplitMatrix* split;
352                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
353                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
354                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
355                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
356                 
357                 split->split();
358                 
359                 if (m->control_pressed) { delete split; return 0; }
360                 
361                 singletonName = split->getSingletonNames();
362                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
363                 delete split;
364                 
365                 //output a merged distance file
366                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
367                         
368                                 
369                 if (m->control_pressed) { return 0; }
370                 
371                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
372                 estart = time(NULL);
373                 
374                 //****************** break up files between processes and cluster each file set ******************************//
375         #ifdef USE_MPI
376                         ////you are process 0 from above////
377                         
378                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
379                         dividedNames.resize(processors);
380                                         
381                         //for each file group figure out which process will complete it
382                         //want to divide the load intelligently so the big files are spread between processes
383                         int count = 1;
384                         for (int i = 0; i < distName.size(); i++) { 
385                                 int processToAssign = (i+1) % processors; 
386                                 if (processToAssign == 0) { processToAssign = processors; }
387                                                 
388                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
389                         }
390                                         
391                         //not lets reverse the order of ever other process, so we balance big files running with little ones
392                         for (int i = 0; i < processors; i++) {
393                                 int remainder = ((i+1) % processors);
394                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
395                         }
396                         
397                         
398                         //send each child the list of files it needs to process
399                         for(int i = 1; i < processors; i++) { 
400                                 //send number of file pairs
401                                 int num = dividedNames[i].size();
402                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403                                 
404                                 for (int j = 0; j < num; j++) { //send filenames to process i
405                                         char tempDistFileName[1024];
406                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
407                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
408                                         
409                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
410                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
411                                         
412                                         char tempNameFileName[1024];
413                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
414                                         int lengthName = (dividedNames[i][j].begin()->second).length();
415
416                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
417                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
418                                 }
419                         }
420                         
421                         //process your share
422                         listFileNames = cluster(dividedNames[0], labels);
423                         
424                         //receive the other processes info
425                         for(int i = 1; i < processors; i++) { 
426                                 int num = dividedNames[i].size();
427                                 
428                                 double tempCutoff;
429                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
430                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
431                                 
432                                 //send list filenames to root process
433                                 for (int j = 0; j < num; j++) {  
434                                         int lengthList = 0;
435                                         char tempListFileName[1024];
436                                 
437                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
438                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
439                                 
440                                         string myListFileName = tempListFileName;
441                                         myListFileName = myListFileName.substr(0, lengthList);
442                                         
443                                         listFileNames.push_back(myListFileName);
444                                 }
445                                 
446                                 //send Labels to root process
447                                 int numLabels = 0;
448                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
449                                 
450                                 for (int j = 0; j < numLabels; j++) {  
451                                         int lengthLabel = 0;
452                                         char tempLabel[100];
453                                 
454                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
455                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
456                                 
457                                         string myLabel = tempLabel;
458                                         myLabel = myLabel.substr(0, lengthLabel);
459                         
460                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
461                                 }
462                         }
463                         
464                 }else { //you are a child process
465                         vector < map<string, string> >  myNames;
466                         
467                         //recieve the files you need to process
468                         //receive number of file pairs
469                         int num = 0;
470                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
471                         
472                         myNames.resize(num);
473         
474                         for (int j = 0; j < num; j++) { //receive filenames to process 
475                                 int lengthDist = 0;
476                                 char tempDistFileName[1024];
477                                 
478                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
479                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
480                                 
481                                 string myDistFileName = tempDistFileName;
482                                 myDistFileName = myDistFileName.substr(0, lengthDist);
483                         
484                                 int lengthName = 0;
485                                 char tempNameFileName[1024];
486                                 
487                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
488                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
489                                 
490                                 string myNameFileName = tempNameFileName;
491                                 myNameFileName = myNameFileName.substr(0, lengthName);
492                                 
493                                 //save file name
494                                 myNames[j][myDistFileName] = myNameFileName;
495                         }
496         
497                         //process them
498                         listFileNames = cluster(myNames, labels);
499                         
500                         //send cutoff
501                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
502                         
503                         //send list filenames to root process
504                         for (int j = 0; j < num; j++) {  
505                                 char tempListFileName[1024];
506                                 strcpy(tempListFileName, listFileNames[j].c_str());
507                                 int lengthList = listFileNames[j].length();
508                                         
509                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
510                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
511                         }
512                         
513                         //send Labels to root process
514                         int numLabels = labels.size();
515                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
516                         
517                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
518                                 char tempLabel[100];
519                                 strcpy(tempLabel, (*it).c_str());
520                                 int lengthLabel = (*it).length();
521                                         
522                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
523                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
524                         }
525                 }
526                 
527                 //make everyone wait
528                 MPI_Barrier(MPI_COMM_WORLD);
529                 
530         #else
531
532                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
533                                 if(processors == 1){
534                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
535                                 }else{
536                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
537                                         dividedNames.resize(processors);
538                                         
539                                         //for each file group figure out which process will complete it
540                                         //want to divide the load intelligently so the big files are spread between processes
541                                         int count = 1;
542                                         for (int i = 0; i < distName.size(); i++) { 
543                                                 int processToAssign = (i+1) % processors; 
544                                                 if (processToAssign == 0) { processToAssign = processors; }
545                                                 
546                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
547                                         }
548                                         
549                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
550                                         for (int i = 0; i < processors; i++) {
551                                                 int remainder = ((i+1) % processors);
552                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
553                                         }
554                                         
555                                         createProcesses(dividedNames);
556                                                         
557                                         if (m->control_pressed) { return 0; }
558
559                                         //get list of list file names from each process
560                                         for(int i=0;i<processors;i++){
561                                                 string filename = toString(processIDS[i]) + ".temp";
562                                                 ifstream in;
563                                                 m->openInputFile(filename, in);
564                                                 
565                                                 in >> tag; m->gobble(in);
566                                                 
567                                                 while(!in.eof()) {
568                                                         string tempName;
569                                                         in >> tempName; m->gobble(in);
570                                                         listFileNames.push_back(tempName);
571                                                 }
572                                                 in.close();
573                                                 remove((toString(processIDS[i]) + ".temp").c_str());
574                                                 
575                                                 //get labels
576                                                 filename = toString(processIDS[i]) + ".temp.labels";
577                                                 ifstream in2;
578                                                 m->openInputFile(filename, in2);
579                                                 
580                                                 float tempCutoff;
581                                                 in2 >> tempCutoff; m->gobble(in2);
582                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
583                                                 
584                                                 while(!in2.eof()) {
585                                                         string tempName;
586                                                         in2 >> tempName; m->gobble(in2);
587                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
588                                                 }
589                                                 in2.close();
590                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
591                                         }
592                                 }
593                 #else
594                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
595                 #endif
596         #endif  
597                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
598                 
599                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
600                 
601                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
602                 
603                 //****************** merge list file and create rabund and sabund files ******************************//
604                 estart = time(NULL);
605                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
606                 
607                 #ifdef USE_MPI
608                         if (pid == 0) { //only process 0 merges
609                 #endif
610
611                 ListVector* listSingle;
612                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
613                 
614                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
615                 
616                 mergeLists(listFileNames, labelBins, listSingle);
617
618                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
619                 
620                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
621                 
622                 m->mothurOutEndLine();
623                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
624                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
625                 m->mothurOutEndLine();
626                 
627                 #ifdef USE_MPI
628                         } //only process 0 merges
629                         
630                         //make everyone wait
631                         MPI_Barrier(MPI_COMM_WORLD);
632                 #endif
633
634                 return 0;
635         }
636         catch(exception& e) {
637                 m->errorOut(e, "ClusterSplitCommand", "execute");
638                 exit(1);
639         }
640 }
641 //**********************************************************************************************************************
642 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
643         try {
644                                 
645                 map<float, int> labelBin;
646                 vector<float> orderFloat;
647                 int numSingleBins;
648                 
649                 //read in singletons
650                 if (singleton != "none") {
651                         ifstream in;
652                         m->openInputFile(singleton, in);
653                                 
654                         string firstCol, secondCol;
655                         listSingle = new ListVector();
656                         while (!in.eof()) {
657                                 in >> firstCol >> secondCol; m->gobble(in);
658                                 listSingle->push_back(secondCol);
659                         }
660                         in.close();
661                         remove(singleton.c_str());
662                         
663                         numSingleBins = listSingle->getNumBins();
664                 }else{  listSingle = NULL; numSingleBins = 0;  }
665                 
666                 //go through users set and make them floats so we can sort them 
667                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
668                         float temp = -10.0;
669
670                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
671                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
672                         
673                         if (temp <= cutoff) {
674                                 orderFloat.push_back(temp);
675                                 labelBin[temp] = numSingleBins; //initialize numbins 
676                         }
677                 }
678         
679                 //sort order
680                 sort(orderFloat.begin(), orderFloat.end());
681                 userLabels.clear();
682                         
683                 //get the list info from each file
684                 for (int k = 0; k < listNames.size(); k++) {
685         
686                         if (m->control_pressed) {  
687                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
688                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
689                                 return labelBin;
690                         }
691                         
692                         InputData* input = new InputData(listNames[k], "list");
693                         ListVector* list = input->getListVector();
694                         string lastLabel = list->getLabel();
695                         
696                         string filledInList = listNames[k] + "filledInTemp";
697                         ofstream outFilled;
698                         m->openOutputFile(filledInList, outFilled);
699         
700                         //for each label needed
701                         for(int l = 0; l < orderFloat.size(); l++){
702                         
703                                 string thisLabel;
704                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
705                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
706
707                                 //this file has reached the end
708                                 if (list == NULL) { 
709                                         list = input->getListVector(lastLabel, true); 
710                                 }else{  //do you have the distance, or do you need to fill in
711                                                 
712                                         float labelFloat;
713                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
714                                         else { convert(list->getLabel(), labelFloat); }
715
716                                         //check for missing labels
717                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
718                                                 //if its bigger get last label, otherwise keep it
719                                                 delete list;
720                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
721                                         }
722                                         lastLabel = list->getLabel();
723                                 }
724                                 
725                                 //print to new file
726                                 list->setLabel(thisLabel);
727                                 list->print(outFilled);
728                 
729                                 //update labelBin
730                                 labelBin[orderFloat[l]] += list->getNumBins();
731                                                                         
732                                 delete list;
733                                                                         
734                                 list = input->getListVector();
735                         }
736                         
737                         if (list != NULL) { delete list; }
738                         delete input;
739                         
740                         outFilled.close();
741                         remove(listNames[k].c_str());
742                         rename(filledInList.c_str(), listNames[k].c_str());
743                 }
744                 
745                 return labelBin;
746         }
747         catch(exception& e) {
748                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
749                 exit(1);
750         }
751 }
752 //**********************************************************************************************************************
753 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
754         try {
755                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
756                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
757                 
758                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
759                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
760                 m->openOutputFile(fileroot+ tag + ".list",              outList);
761                                 
762                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
763                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
764                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
765                 
766                 map<float, int>::iterator itLabel;
767
768                 //for each label needed
769                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
770                         
771                         string thisLabel;
772                         if (itLabel->first == -1) { thisLabel = "unique"; }
773                         else { thisLabel = toString(itLabel->first,  length-1);  } 
774                         
775                         outList << thisLabel << '\t' << itLabel->second << '\t';
776
777                         RAbundVector* rabund = new RAbundVector();
778                         rabund->setLabel(thisLabel);
779
780                         //add in singletons
781                         if (listSingle != NULL) {
782                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
783                                         outList << listSingle->get(j) << '\t';
784                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
785                                 }
786                         }
787                         
788                         //get the list info from each file
789                         for (int k = 0; k < listNames.size(); k++) {
790         
791                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
792                                 
793                                 InputData* input = new InputData(listNames[k], "list");
794                                 ListVector* list = input->getListVector(thisLabel);
795                                 
796                                 //this file has reached the end
797                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
798                                 else {          
799                                         for (int j = 0; j < list->getNumBins(); j++) {
800                                                 outList << list->get(j) << '\t';
801                                                 rabund->push_back(m->getNumNames(list->get(j)));
802                                         }
803                                         delete list;
804                                 }
805                                 delete input;
806                         }
807                         
808                         SAbundVector sabund = rabund->getSAbundVector();
809                         
810                         sabund.print(outSabund);
811                         rabund->print(outRabund);
812                         outList << endl;
813                         
814                         delete rabund;
815                 }
816                 
817                 outList.close();
818                 outRabund.close();
819                 outSabund.close();
820                 
821                 if (listSingle != NULL) { delete listSingle;  }
822                 
823                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
824                 
825                 return 0;
826         }
827         catch(exception& e) {
828                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
829                 exit(1);
830         }
831 }
832
833 //**********************************************************************************************************************
834
835 void ClusterSplitCommand::printData(ListVector* oldList){
836         try {
837                 string label = oldList->getLabel();
838                 RAbundVector oldRAbund = oldList->getRAbundVector();
839                 
840                 oldRAbund.setLabel(label);
841                 if (m->isTrue(showabund)) {
842                         oldRAbund.getSAbundVector().print(cout);
843                 }
844                 oldRAbund.print(outRabund);
845                 oldRAbund.getSAbundVector().print(outSabund);
846         
847                 oldList->print(outList);
848         }
849         catch(exception& e) {
850                 m->errorOut(e, "ClusterSplitCommand", "printData");
851                 exit(1);
852         }
853 }
854 //**********************************************************************************************************************
855 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
856         try {
857         
858         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
859                 int process = 0;
860                 int exitCommand = 1;
861                 processIDS.clear();
862                 
863                 //loop through and create all the processes you want
864                 while (process != processors) {
865                         int pid = fork();
866                         
867                         if (pid > 0) {
868                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
869                                 process++;
870                         }else if (pid == 0){
871                                 set<string> labels;
872                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
873                                 
874                                 //write out names to file
875                                 string filename = toString(getpid()) + ".temp";
876                                 ofstream out;
877                                 m->openOutputFile(filename, out);
878                                 out << tag << endl;
879                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
880                                 out.close();
881                                 
882                                 //print out labels
883                                 ofstream outLabels;
884                                 filename = toString(getpid()) + ".temp.labels";
885                                 m->openOutputFile(filename, outLabels);
886                                 
887                                 outLabels << cutoff << endl;
888                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
889                                         outLabels << (*it) << endl;
890                                 }
891                                 outLabels.close();
892
893                                 exit(0);
894                         }else { 
895                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
896                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
897                                 exit(0);
898                         }
899                 }
900                 
901                 //force parent to wait until all the processes are done
902                 for (int i=0;i<processors;i++) { 
903                         int temp = processIDS[i];
904                         wait(&temp);
905                 }
906                 
907                 return exitCommand;
908         #endif          
909         
910         }
911         catch(exception& e) {
912                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
913                 exit(1);
914         }
915 }
916 //**********************************************************************************************************************
917
918 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
919         try {
920                 Cluster* cluster;
921                 SparseMatrix* matrix;
922                 ListVector* list;
923                 ListVector oldList;
924                 RAbundVector* rabund;
925                 
926                 vector<string> listFileNames;
927                 
928                 double smallestCutoff = cutoff;
929                 
930                 //cluster each distance file
931                 for (int i = 0; i < distNames.size(); i++) {
932                         if (m->control_pressed) { return listFileNames; }
933                         
934                         string thisNamefile = distNames[i].begin()->second;
935                         string thisDistFile = distNames[i].begin()->first;
936                         
937                         //read in distance file
938                         globaldata->setNameFile(thisNamefile);
939                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
940                         
941                         #ifdef USE_MPI
942                                 int pid;
943                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
944                                 
945                                 //output your files too
946                                 if (pid != 0) {
947                                         cout << endl << "Reading " << thisDistFile << endl;
948                                 }
949                         #endif
950                         
951                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
952                         
953                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
954                         read->setCutoff(cutoff);
955
956                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
957                         nameMap->readMap();
958                         read->read(nameMap);
959                         
960                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
961                         
962                         list = read->getListVector();
963                         oldList = *list;
964                         matrix = read->getMatrix();
965                         
966                         delete read; 
967                         delete nameMap; 
968                         
969                         
970                         #ifdef USE_MPI
971                                 //output your files too
972                                 if (pid != 0) {
973                                         cout << endl << "Clustering " << thisDistFile << endl;
974                                 }
975                         #endif
976                         
977                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
978                 
979                         rabund = new RAbundVector(list->getRAbundVector());
980                         
981                         //create cluster
982                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
983                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
984                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
985                         tag = cluster->getTag();
986                 
987                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
988                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
989                         
990                         ofstream listFile;
991                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
992                 
993                         listFileNames.push_back(fileroot+ tag + ".list");
994                 
995                         time_t estart = time(NULL);
996                         
997                         float previousDist = 0.00000;
998                         float rndPreviousDist = 0.00000;
999                         
1000                         oldList = *list;
1001
1002                         print_start = true;
1003                         start = time(NULL);
1004                         double saveCutoff = cutoff;
1005                 
1006                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1007                 
1008                                 if (m->control_pressed) { //clean up
1009                                         delete matrix; delete list;     delete cluster; delete rabund;
1010                                         listFile.close();
1011                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1012                                         listFileNames.clear(); return listFileNames;
1013                                 }
1014                 
1015                                 cluster->update(saveCutoff);
1016         
1017                                 float dist = matrix->getSmallDist();
1018                                 float rndDist;
1019                                 if (hard) {
1020                                         rndDist = m->ceilDist(dist, precision); 
1021                                 }else{
1022                                         rndDist = m->roundDist(dist, precision); 
1023                                 }
1024
1025                                 if(previousDist <= 0.0000 && dist != previousDist){
1026                                         oldList.setLabel("unique");
1027                                         oldList.print(listFile);
1028                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1029                                 }
1030                                 else if(rndDist != rndPreviousDist){
1031                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1032                                         oldList.print(listFile);
1033                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1034                                 }
1035                 
1036                                 previousDist = dist;
1037                                 rndPreviousDist = rndDist;
1038                                 oldList = *list;
1039                         }
1040
1041                 
1042                         if(previousDist <= 0.0000){
1043                                 oldList.setLabel("unique");
1044                                 oldList.print(listFile);
1045                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1046                         }
1047                         else if(rndPreviousDist<cutoff){
1048                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1049                                 oldList.print(listFile);
1050                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1051                         }
1052         
1053                         delete matrix; delete list;     delete cluster; delete rabund; 
1054                         listFile.close();
1055                         
1056                         if (m->control_pressed) { //clean up
1057                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1058                                 listFileNames.clear(); return listFileNames;
1059                         }
1060                         
1061                         remove(thisDistFile.c_str());
1062                         remove(thisNamefile.c_str());
1063                         
1064                         if (saveCutoff != cutoff) { 
1065                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1066                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1067                         
1068                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1069                         }
1070                         
1071                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1072                 }
1073                 
1074                 cutoff = smallestCutoff;
1075                                         
1076                 return listFileNames;
1077         
1078         }
1079         catch(exception& e) {
1080                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1081                 exit(1);
1082         }
1083
1084
1085 }
1086 //**********************************************************************************************************************
1087
1088 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1089         try{
1090                 
1091 #ifdef USE_MPI
1092                 int pid;
1093                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1094                 
1095                 if (pid != 0) {
1096 #endif
1097                 
1098                 string thisOutputDir = outputDir;
1099                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1100                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1101                 remove(outputFileName.c_str());
1102                 
1103                 
1104                 for (int i = 0; i < distNames.size(); i++) {
1105                         if (m->control_pressed) {  return 0; }
1106                         
1107                         string thisDistFile = distNames[i].begin()->first;
1108                         
1109                         m->appendFiles(thisDistFile, outputFileName);
1110                 }       
1111                         
1112                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1113                         
1114 #ifdef USE_MPI
1115                 }
1116 #endif
1117                                 
1118                         
1119                 
1120                 
1121         }
1122         catch(exception& e) {
1123                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1124                 exit(1);
1125         }
1126 }
1127 //**********************************************************************************************************************