]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
working on chimera change to add trim feature, fixed bug in print of distance file...
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){       
21         try {
22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
24                 return myArray;
25         }
26         catch(exception& e) {
27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
28                 exit(1);
29         }
30 }
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){     
33         try {
34                 abort = true;
35                 //initialize outputTypes
36                 vector<string> tempOutNames;
37                 outputTypes["list"] = tempOutNames;
38                 outputTypes["rabund"] = tempOutNames;
39                 outputTypes["sabund"] = tempOutNames;
40                 outputTypes["column"] = tempOutNames;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
44                 exit(1);
45         }
46 }
47 //**********************************************************************************************************************
48 vector<string> ClusterSplitCommand::getRequiredParameters(){    
49         try {
50                 string Array[] =  {"fasta","phylip","column","or"};
51                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 vector<string> ClusterSplitCommand::getRequiredFiles(){ 
61         try {
62                 vector<string> myArray;
63                 return myArray;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
72 ClusterSplitCommand::ClusterSplitCommand(string option)  {
73         try{
74                 globaldata = GlobalData::getInstance();
75                 abort = false;
76                 format = "";
77                 
78                 //allow user to run help
79                 if(option == "help") { help(); abort = true; }
80                 
81                 else {
82                         //valid paramters for this command
83                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
84                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter("cluster.split");
90                 
91                         //check to make sure all parameters are valid for command
92                         map<string,string>::iterator it;
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
95                                         abort = true;
96                                 }
97                         }
98                         
99                         //initialize outputTypes
100                         vector<string> tempOutNames;
101                         outputTypes["list"] = tempOutNames;
102                         outputTypes["rabund"] = tempOutNames;
103                         outputTypes["sabund"] = tempOutNames;
104                         outputTypes["column"] = tempOutNames;
105                         
106                         globaldata->newRead();
107                         
108                         //if the user changes the output directory command factory will send this info to us in the output parameter 
109                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
110                         
111                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
112                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
113                         if (inputDir == "not found"){   inputDir = "";          }
114                         else {
115                                 string path;
116                                 it = parameters.find("phylip");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
122                                 }
123                                 
124                                 it = parameters.find("column");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
130                                 }
131                                 
132                                 it = parameters.find("name");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
138                                 }
139                                 
140                                 it = parameters.find("taxonomy");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
146                                 }
147                                 
148                                 it = parameters.find("fasta");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
154                                 }
155                         }
156                         
157                         //check for required parameters
158                         phylipfile = validParameter.validFile(parameters, "phylip", true);
159                         if (phylipfile == "not open") { abort = true; }
160                         else if (phylipfile == "not found") { phylipfile = ""; }        
161                         else {  distfile = phylipfile;  format = "phylip";      }
162                         
163                         columnfile = validParameter.validFile(parameters, "column", true);
164                         if (columnfile == "not open") { abort = true; } 
165                         else if (columnfile == "not found") { columnfile = ""; }
166                         else {  distfile = columnfile; format = "column";       }
167                         
168                         namefile = validParameter.validFile(parameters, "name", true);
169                         if (namefile == "not open") { abort = true; }   
170                         else if (namefile == "not found") { namefile = ""; }
171                         
172                         fastafile = validParameter.validFile(parameters, "fasta", true);
173                         if (fastafile == "not open") { abort = true; }  
174                         else if (fastafile == "not found") { fastafile = ""; }
175                         else { distfile = fastafile;  splitmethod = "fasta";  }
176                         
177                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
178                         if (taxFile == "not open") { abort = true; }    
179                         else if (taxFile == "not found") { taxFile = ""; }
180                         
181                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
182                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
183                 
184                         if (columnfile != "") {
185                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
186                         }
187                         
188                         if (fastafile != "") {
189                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
191                         }
192                                         
193                         //check for optional parameter and set defaults
194                         // ...at some point should added some additional type checking...
195                         //get user cutoff and precision or use defaults
196                         string temp;
197                         temp = validParameter.validFile(parameters, "precision", false);
198                         if (temp == "not found") { temp = "100"; }
199                         //saves precision legnth for formatting below
200                         length = temp.length();
201                         convert(temp, precision); 
202                         
203                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
204                         hard = m->isTrue(temp);
205                         
206                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
207                         large = m->isTrue(temp);
208                         
209                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
210                         convert(temp, processors); 
211                         
212                         temp = validParameter.validFile(parameters, "splitmethod", false);      
213                         if (splitmethod != "fasta") {
214                                 if (temp == "not found")  { splitmethod = "distance"; }
215                                 else {  splitmethod = temp; }
216                         }
217                         
218                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
219                         convert(temp, cutoff); 
220                         cutoff += (5 / (precision * 10.0));  
221                         
222                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
223                         convert(temp, taxLevelCutoff); 
224                         
225                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
226                         
227                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
228                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
229                         
230                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
231                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
232                         
233                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
234
235                         showabund = validParameter.validFile(parameters, "showabund", false);
236                         if (showabund == "not found") { showabund = "T"; }
237
238                         timing = validParameter.validFile(parameters, "timing", false);
239                         if (timing == "not found") { timing = "F"; }
240                         
241                 }
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
245                 exit(1);
246         }
247 }
248
249 //**********************************************************************************************************************
250
251 void ClusterSplitCommand::help(){
252         try {
253                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
254                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
255                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
256                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
257                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
258                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
259                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
260                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
261                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
262                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
263                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
264                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
265                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
266                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
267                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
268                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
269                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
270                 #ifdef USE_MPI
271                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
272                 #endif
273                 m->mothurOut("The cluster.split command should be in the following format: \n");
274                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
275                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
276
277         }
278         catch(exception& e) {
279                 m->errorOut(e, "ClusterSplitCommand", "help");
280                 exit(1);
281         }
282 }
283
284 //**********************************************************************************************************************
285
286 ClusterSplitCommand::~ClusterSplitCommand(){}
287
288 //**********************************************************************************************************************
289
290 int ClusterSplitCommand::execute(){
291         try {
292         
293                 if (abort == true) {    return 0;       }
294                 
295                 time_t estart;
296                 vector<string> listFileNames;
297                 set<string> labels;
298                 string singletonName = "";
299                 double saveCutoff = cutoff;
300
301                 //****************** file prep work ******************************//
302                 #ifdef USE_MPI
303                         int pid;
304                         int tag = 2001;
305                         MPI_Status status; 
306                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
307                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
308                         
309                         if (pid == 0) { //only process 0 converts and splits
310                         
311                 #endif
312                 
313                 //if user gave a phylip file convert to column file
314                 if (format == "phylip") {
315                         estart = time(NULL);
316                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
317                         
318                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
319                         
320                         NameAssignment* nameMap = NULL;
321                         convert->setFormat("phylip");
322                         convert->read(nameMap);
323                         
324                         if (m->control_pressed) {  delete convert;  return 0;  }
325                         
326                         distfile = convert->getOutputFile();
327                 
328                         //if no names file given with phylip file, create it
329                         ListVector* listToMakeNameFile =  convert->getListVector();
330                         if (namefile == "") {  //you need to make a namefile for split matrix
331                                 ofstream out;
332                                 namefile = phylipfile + ".names";
333                                 m->openOutputFile(namefile, out);
334                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
335                                         string bin = listToMakeNameFile->get(i);
336                                         out << bin << '\t' << bin << endl;
337                                 }
338                                 out.close();
339                         }
340                         delete listToMakeNameFile;
341                         delete convert;
342                         
343                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
344                 }
345                 if (m->control_pressed) { return 0; }
346                 
347                 estart = time(NULL);
348                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
349                 
350                 //split matrix into non-overlapping groups
351                 SplitMatrix* split;
352                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
353                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
354                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
355                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
356                 
357                 split->split();
358                 
359                 if (m->control_pressed) { delete split; return 0; }
360                 
361                 singletonName = split->getSingletonNames();
362                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
363                 delete split;
364                 
365                 //output a merged distance file
366                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
367                         
368                                 
369                 if (m->control_pressed) { return 0; }
370                 
371                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
372                 estart = time(NULL);
373                 
374                 //****************** break up files between processes and cluster each file set ******************************//
375         #ifdef USE_MPI
376                         ////you are process 0 from above////
377                         
378                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
379                         dividedNames.resize(processors);
380                                         
381                         //for each file group figure out which process will complete it
382                         //want to divide the load intelligently so the big files are spread between processes
383                         for (int i = 0; i < distName.size(); i++) { 
384                                 int processToAssign = (i+1) % processors; 
385                                 if (processToAssign == 0) { processToAssign = processors; }
386                                                 
387                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
388                         }
389                                         
390                         //not lets reverse the order of ever other process, so we balance big files running with little ones
391                         for (int i = 0; i < processors; i++) {
392                                 int remainder = ((i+1) % processors);
393                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
394                         }
395                         
396                         
397                         //send each child the list of files it needs to process
398                         for(int i = 1; i < processors; i++) { 
399                                 //send number of file pairs
400                                 int num = dividedNames[i].size();
401                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
402                                 
403                                 for (int j = 0; j < num; j++) { //send filenames to process i
404                                         char tempDistFileName[1024];
405                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
406                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
407                                         
408                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
409                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
410                                         
411                                         char tempNameFileName[1024];
412                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
413                                         int lengthName = (dividedNames[i][j].begin()->second).length();
414
415                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
417                                 }
418                         }
419                         
420                         //process your share
421                         listFileNames = cluster(dividedNames[0], labels);
422                         
423                         //receive the other processes info
424                         for(int i = 1; i < processors; i++) { 
425                                 int num = dividedNames[i].size();
426                                 
427                                 double tempCutoff;
428                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
429                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
430                                 
431                                 //send list filenames to root process
432                                 for (int j = 0; j < num; j++) {  
433                                         int lengthList = 0;
434                                         char tempListFileName[1024];
435                                 
436                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
437                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
438                                 
439                                         string myListFileName = tempListFileName;
440                                         myListFileName = myListFileName.substr(0, lengthList);
441                                         
442                                         listFileNames.push_back(myListFileName);
443                                 }
444                                 
445                                 //send Labels to root process
446                                 int numLabels = 0;
447                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
448                                 
449                                 for (int j = 0; j < numLabels; j++) {  
450                                         int lengthLabel = 0;
451                                         char tempLabel[100];
452                                 
453                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
454                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
455                                 
456                                         string myLabel = tempLabel;
457                                         myLabel = myLabel.substr(0, lengthLabel);
458                         
459                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
460                                 }
461                         }
462                         
463                 }else { //you are a child process
464                         vector < map<string, string> >  myNames;
465                         
466                         //recieve the files you need to process
467                         //receive number of file pairs
468                         int num = 0;
469                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
470                         
471                         myNames.resize(num);
472         
473                         for (int j = 0; j < num; j++) { //receive filenames to process 
474                                 int lengthDist = 0;
475                                 char tempDistFileName[1024];
476                                 
477                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
478                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
479                                 
480                                 string myDistFileName = tempDistFileName;
481                                 myDistFileName = myDistFileName.substr(0, lengthDist);
482                         
483                                 int lengthName = 0;
484                                 char tempNameFileName[1024];
485                                 
486                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
487                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
488                                 
489                                 string myNameFileName = tempNameFileName;
490                                 myNameFileName = myNameFileName.substr(0, lengthName);
491                                 
492                                 //save file name
493                                 myNames[j][myDistFileName] = myNameFileName;
494                         }
495         
496                         //process them
497                         listFileNames = cluster(myNames, labels);
498                         
499                         //send cutoff
500                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
501                         
502                         //send list filenames to root process
503                         for (int j = 0; j < num; j++) {  
504                                 char tempListFileName[1024];
505                                 strcpy(tempListFileName, listFileNames[j].c_str());
506                                 int lengthList = listFileNames[j].length();
507                                         
508                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
509                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
510                         }
511                         
512                         //send Labels to root process
513                         int numLabels = labels.size();
514                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
515                         
516                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
517                                 char tempLabel[100];
518                                 strcpy(tempLabel, (*it).c_str());
519                                 int lengthLabel = (*it).length();
520                                         
521                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
522                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
523                         }
524                 }
525                 
526                 //make everyone wait
527                 MPI_Barrier(MPI_COMM_WORLD);
528                 
529         #else
530
531                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
532                                 if(processors == 1){
533                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
534                                 }else{
535                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
536                                         dividedNames.resize(processors);
537                                         
538                                         //for each file group figure out which process will complete it
539                                         //want to divide the load intelligently so the big files are spread between processes
540                                         for (int i = 0; i < distName.size(); i++) { 
541                                                 int processToAssign = (i+1) % processors; 
542                                                 if (processToAssign == 0) { processToAssign = processors; }
543                                                 
544                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
545                                         }
546                                         
547                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
548                                         for (int i = 0; i < processors; i++) {
549                                                 int remainder = ((i+1) % processors);
550                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
551                                         }
552                                         
553                                         createProcesses(dividedNames);
554                                                         
555                                         if (m->control_pressed) { return 0; }
556
557                                         //get list of list file names from each process
558                                         for(int i=0;i<processors;i++){
559                                                 string filename = toString(processIDS[i]) + ".temp";
560                                                 ifstream in;
561                                                 m->openInputFile(filename, in);
562                                                 
563                                                 in >> tag; m->gobble(in);
564                                                 
565                                                 while(!in.eof()) {
566                                                         string tempName;
567                                                         in >> tempName; m->gobble(in);
568                                                         listFileNames.push_back(tempName);
569                                                 }
570                                                 in.close();
571                                                 remove((toString(processIDS[i]) + ".temp").c_str());
572                                                 
573                                                 //get labels
574                                                 filename = toString(processIDS[i]) + ".temp.labels";
575                                                 ifstream in2;
576                                                 m->openInputFile(filename, in2);
577                                                 
578                                                 float tempCutoff;
579                                                 in2 >> tempCutoff; m->gobble(in2);
580                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
581                                                 
582                                                 while(!in2.eof()) {
583                                                         string tempName;
584                                                         in2 >> tempName; m->gobble(in2);
585                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
586                                                 }
587                                                 in2.close();
588                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
589                                         }
590                                 }
591                 #else
592                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
593                 #endif
594         #endif  
595                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
596                 
597                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
598                 
599                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
600                 
601                 //****************** merge list file and create rabund and sabund files ******************************//
602                 estart = time(NULL);
603                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
604                 
605                 #ifdef USE_MPI
606                         if (pid == 0) { //only process 0 merges
607                 #endif
608
609                 ListVector* listSingle;
610                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
611                 
612                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
613                 
614                 mergeLists(listFileNames, labelBins, listSingle);
615
616                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
617                 
618                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
619                 
620                 m->mothurOutEndLine();
621                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
622                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
623                 m->mothurOutEndLine();
624                 
625                 #ifdef USE_MPI
626                         } //only process 0 merges
627                         
628                         //make everyone wait
629                         MPI_Barrier(MPI_COMM_WORLD);
630                 #endif
631
632                 return 0;
633         }
634         catch(exception& e) {
635                 m->errorOut(e, "ClusterSplitCommand", "execute");
636                 exit(1);
637         }
638 }
639 //**********************************************************************************************************************
640 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
641         try {
642                                 
643                 map<float, int> labelBin;
644                 vector<float> orderFloat;
645                 int numSingleBins;
646                 
647                 //read in singletons
648                 if (singleton != "none") {
649                         ifstream in;
650                         m->openInputFile(singleton, in);
651                                 
652                         string firstCol, secondCol;
653                         listSingle = new ListVector();
654                         while (!in.eof()) {
655                                 in >> firstCol >> secondCol; m->gobble(in);
656                                 listSingle->push_back(secondCol);
657                         }
658                         in.close();
659                         remove(singleton.c_str());
660                         
661                         numSingleBins = listSingle->getNumBins();
662                 }else{  listSingle = NULL; numSingleBins = 0;  }
663                 
664                 //go through users set and make them floats so we can sort them 
665                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
666                         float temp = -10.0;
667
668                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
669                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
670                         
671                         if (temp <= cutoff) {
672                                 orderFloat.push_back(temp);
673                                 labelBin[temp] = numSingleBins; //initialize numbins 
674                         }
675                 }
676         
677                 //sort order
678                 sort(orderFloat.begin(), orderFloat.end());
679                 userLabels.clear();
680                         
681                 //get the list info from each file
682                 for (int k = 0; k < listNames.size(); k++) {
683         
684                         if (m->control_pressed) {  
685                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
686                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
687                                 return labelBin;
688                         }
689                         
690                         InputData* input = new InputData(listNames[k], "list");
691                         ListVector* list = input->getListVector();
692                         string lastLabel = list->getLabel();
693                         
694                         string filledInList = listNames[k] + "filledInTemp";
695                         ofstream outFilled;
696                         m->openOutputFile(filledInList, outFilled);
697         
698                         //for each label needed
699                         for(int l = 0; l < orderFloat.size(); l++){
700                         
701                                 string thisLabel;
702                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
703                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
704
705                                 //this file has reached the end
706                                 if (list == NULL) { 
707                                         list = input->getListVector(lastLabel, true); 
708                                 }else{  //do you have the distance, or do you need to fill in
709                                                 
710                                         float labelFloat;
711                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
712                                         else { convert(list->getLabel(), labelFloat); }
713
714                                         //check for missing labels
715                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
716                                                 //if its bigger get last label, otherwise keep it
717                                                 delete list;
718                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
719                                         }
720                                         lastLabel = list->getLabel();
721                                 }
722                                 
723                                 //print to new file
724                                 list->setLabel(thisLabel);
725                                 list->print(outFilled);
726                 
727                                 //update labelBin
728                                 labelBin[orderFloat[l]] += list->getNumBins();
729                                                                         
730                                 delete list;
731                                                                         
732                                 list = input->getListVector();
733                         }
734                         
735                         if (list != NULL) { delete list; }
736                         delete input;
737                         
738                         outFilled.close();
739                         remove(listNames[k].c_str());
740                         rename(filledInList.c_str(), listNames[k].c_str());
741                 }
742                 
743                 return labelBin;
744         }
745         catch(exception& e) {
746                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
747                 exit(1);
748         }
749 }
750 //**********************************************************************************************************************
751 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
752         try {
753                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
754                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
755                 
756                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
757                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
758                 m->openOutputFile(fileroot+ tag + ".list",              outList);
759                                 
760                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
761                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
762                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
763                 
764                 map<float, int>::iterator itLabel;
765
766                 //for each label needed
767                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
768                         
769                         string thisLabel;
770                         if (itLabel->first == -1) { thisLabel = "unique"; }
771                         else { thisLabel = toString(itLabel->first,  length-1);  } 
772                         
773                         outList << thisLabel << '\t' << itLabel->second << '\t';
774
775                         RAbundVector* rabund = new RAbundVector();
776                         rabund->setLabel(thisLabel);
777
778                         //add in singletons
779                         if (listSingle != NULL) {
780                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
781                                         outList << listSingle->get(j) << '\t';
782                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
783                                 }
784                         }
785                         
786                         //get the list info from each file
787                         for (int k = 0; k < listNames.size(); k++) {
788         
789                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
790                                 
791                                 InputData* input = new InputData(listNames[k], "list");
792                                 ListVector* list = input->getListVector(thisLabel);
793                                 
794                                 //this file has reached the end
795                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
796                                 else {          
797                                         for (int j = 0; j < list->getNumBins(); j++) {
798                                                 outList << list->get(j) << '\t';
799                                                 rabund->push_back(m->getNumNames(list->get(j)));
800                                         }
801                                         delete list;
802                                 }
803                                 delete input;
804                         }
805                         
806                         SAbundVector sabund = rabund->getSAbundVector();
807                         
808                         sabund.print(outSabund);
809                         rabund->print(outRabund);
810                         outList << endl;
811                         
812                         delete rabund;
813                 }
814                 
815                 outList.close();
816                 outRabund.close();
817                 outSabund.close();
818                 
819                 if (listSingle != NULL) { delete listSingle;  }
820                 
821                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
822                 
823                 return 0;
824         }
825         catch(exception& e) {
826                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
827                 exit(1);
828         }
829 }
830
831 //**********************************************************************************************************************
832
833 void ClusterSplitCommand::printData(ListVector* oldList){
834         try {
835                 string label = oldList->getLabel();
836                 RAbundVector oldRAbund = oldList->getRAbundVector();
837                 
838                 oldRAbund.setLabel(label);
839                 if (m->isTrue(showabund)) {
840                         oldRAbund.getSAbundVector().print(cout);
841                 }
842                 oldRAbund.print(outRabund);
843                 oldRAbund.getSAbundVector().print(outSabund);
844         
845                 oldList->print(outList);
846         }
847         catch(exception& e) {
848                 m->errorOut(e, "ClusterSplitCommand", "printData");
849                 exit(1);
850         }
851 }
852 //**********************************************************************************************************************
853 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
854         try {
855         
856         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
857                 int process = 0;
858                 int exitCommand = 1;
859                 processIDS.clear();
860                 
861                 //loop through and create all the processes you want
862                 while (process != processors) {
863                         int pid = fork();
864                         
865                         if (pid > 0) {
866                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
867                                 process++;
868                         }else if (pid == 0){
869                                 set<string> labels;
870                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
871                                 
872                                 //write out names to file
873                                 string filename = toString(getpid()) + ".temp";
874                                 ofstream out;
875                                 m->openOutputFile(filename, out);
876                                 out << tag << endl;
877                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
878                                 out.close();
879                                 
880                                 //print out labels
881                                 ofstream outLabels;
882                                 filename = toString(getpid()) + ".temp.labels";
883                                 m->openOutputFile(filename, outLabels);
884                                 
885                                 outLabels << cutoff << endl;
886                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
887                                         outLabels << (*it) << endl;
888                                 }
889                                 outLabels.close();
890
891                                 exit(0);
892                         }else { 
893                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
894                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
895                                 exit(0);
896                         }
897                 }
898                 
899                 //force parent to wait until all the processes are done
900                 for (int i=0;i<processors;i++) { 
901                         int temp = processIDS[i];
902                         wait(&temp);
903                 }
904                 
905                 return exitCommand;
906         #endif          
907         
908         }
909         catch(exception& e) {
910                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
911                 exit(1);
912         }
913 }
914 //**********************************************************************************************************************
915
916 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
917         try {
918                 Cluster* cluster;
919                 SparseMatrix* matrix;
920                 ListVector* list;
921                 ListVector oldList;
922                 RAbundVector* rabund;
923                 
924                 vector<string> listFileNames;
925                 
926                 double smallestCutoff = cutoff;
927                 
928                 //cluster each distance file
929                 for (int i = 0; i < distNames.size(); i++) {
930                         if (m->control_pressed) { return listFileNames; }
931                         
932                         string thisNamefile = distNames[i].begin()->second;
933                         string thisDistFile = distNames[i].begin()->first;
934                         
935                         //read in distance file
936                         globaldata->setNameFile(thisNamefile);
937                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
938                         
939                         #ifdef USE_MPI
940                                 int pid;
941                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
942                                 
943                                 //output your files too
944                                 if (pid != 0) {
945                                         cout << endl << "Reading " << thisDistFile << endl;
946                                 }
947                         #endif
948                         
949                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
950                         
951                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
952                         read->setCutoff(cutoff);
953
954                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
955                         nameMap->readMap();
956                         read->read(nameMap);
957                         
958                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
959                         
960                         list = read->getListVector();
961                         oldList = *list;
962                         matrix = read->getMatrix();
963                         
964                         delete read; 
965                         delete nameMap; 
966                         
967                         
968                         #ifdef USE_MPI
969                                 //output your files too
970                                 if (pid != 0) {
971                                         cout << endl << "Clustering " << thisDistFile << endl;
972                                 }
973                         #endif
974                         
975                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
976                 
977                         rabund = new RAbundVector(list->getRAbundVector());
978                         
979                         //create cluster
980                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
981                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
982                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
983                         tag = cluster->getTag();
984                 
985                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
986                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
987                         
988                         ofstream listFile;
989                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
990                 
991                         listFileNames.push_back(fileroot+ tag + ".list");
992                                 
993                         float previousDist = 0.00000;
994                         float rndPreviousDist = 0.00000;
995                         
996                         oldList = *list;
997
998                         print_start = true;
999                         start = time(NULL);
1000                         double saveCutoff = cutoff;
1001                 
1002                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1003                 
1004                                 if (m->control_pressed) { //clean up
1005                                         delete matrix; delete list;     delete cluster; delete rabund;
1006                                         listFile.close();
1007                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1008                                         listFileNames.clear(); return listFileNames;
1009                                 }
1010                 
1011                                 cluster->update(saveCutoff);
1012         
1013                                 float dist = matrix->getSmallDist();
1014                                 float rndDist;
1015                                 if (hard) {
1016                                         rndDist = m->ceilDist(dist, precision); 
1017                                 }else{
1018                                         rndDist = m->roundDist(dist, precision); 
1019                                 }
1020
1021                                 if(previousDist <= 0.0000 && dist != previousDist){
1022                                         oldList.setLabel("unique");
1023                                         oldList.print(listFile);
1024                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1025                                 }
1026                                 else if(rndDist != rndPreviousDist){
1027                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1028                                         oldList.print(listFile);
1029                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1030                                 }
1031                 
1032                                 previousDist = dist;
1033                                 rndPreviousDist = rndDist;
1034                                 oldList = *list;
1035                         }
1036
1037                 
1038                         if(previousDist <= 0.0000){
1039                                 oldList.setLabel("unique");
1040                                 oldList.print(listFile);
1041                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1042                         }
1043                         else if(rndPreviousDist<cutoff){
1044                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1045                                 oldList.print(listFile);
1046                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1047                         }
1048         
1049                         delete matrix; delete list;     delete cluster; delete rabund; 
1050                         listFile.close();
1051                         
1052                         if (m->control_pressed) { //clean up
1053                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1054                                 listFileNames.clear(); return listFileNames;
1055                         }
1056                         
1057                         remove(thisDistFile.c_str());
1058                         remove(thisNamefile.c_str());
1059                         
1060                         if (saveCutoff != cutoff) { 
1061                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1062                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1063                         
1064                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1065                         }
1066                         
1067                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1068                 }
1069                 
1070                 cutoff = smallestCutoff;
1071                                         
1072                 return listFileNames;
1073         
1074         }
1075         catch(exception& e) {
1076                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1077                 exit(1);
1078         }
1079
1080
1081 }
1082 //**********************************************************************************************************************
1083
1084 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1085         try{
1086                 
1087 #ifdef USE_MPI
1088                 int pid;
1089                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1090                 
1091                 if (pid != 0) {
1092 #endif
1093                 
1094                 string thisOutputDir = outputDir;
1095                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1096                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1097                 remove(outputFileName.c_str());
1098                 
1099                 
1100                 for (int i = 0; i < distNames.size(); i++) {
1101                         if (m->control_pressed) {  return 0; }
1102                         
1103                         string thisDistFile = distNames[i].begin()->first;
1104                         
1105                         m->appendFiles(thisDistFile, outputFileName);
1106                 }       
1107                         
1108                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1109                         
1110 #ifdef USE_MPI
1111                 }
1112 #endif
1113                                 
1114                 return 0;       
1115                 
1116                 
1117         }
1118         catch(exception& e) {
1119                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1120                 exit(1);
1121         }
1122 }
1123 //**********************************************************************************************************************