]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
added [ERROR] flag if command aborts
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){       
21         try {
22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
24                 return myArray;
25         }
26         catch(exception& e) {
27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
28                 exit(1);
29         }
30 }
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){     
33         try {
34                 abort = true; calledHelp = true; 
35                 vector<string> tempOutNames;
36                 outputTypes["list"] = tempOutNames;
37                 outputTypes["rabund"] = tempOutNames;
38                 outputTypes["sabund"] = tempOutNames;
39                 outputTypes["column"] = tempOutNames;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
43                 exit(1);
44         }
45 }
46 //**********************************************************************************************************************
47 vector<string> ClusterSplitCommand::getRequiredParameters(){    
48         try {
49                 string Array[] =  {"fasta","phylip","column","or"};
50                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
51                 return myArray;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
55                 exit(1);
56         }
57 }
58 //**********************************************************************************************************************
59 vector<string> ClusterSplitCommand::getRequiredFiles(){ 
60         try {
61                 vector<string> myArray;
62                 return myArray;
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
66                 exit(1);
67         }
68 }
69 //**********************************************************************************************************************
70 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
71 ClusterSplitCommand::ClusterSplitCommand(string option)  {
72         try{
73                 globaldata = GlobalData::getInstance();
74                 abort = false; calledHelp = false;   
75                 format = "";
76                 
77                 //allow user to run help
78                 if(option == "help") { help(); abort = true; calledHelp = true; }
79                 
80                 else {
81                         //valid paramters for this command
82                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
83                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
84                         
85                         OptionParser parser(option);
86                         map<string,string> parameters = parser.getParameters();
87                         
88                         ValidParameters validParameter("cluster.split");
89                 
90                         //check to make sure all parameters are valid for command
91                         map<string,string>::iterator it;
92                         for (it = parameters.begin(); it != parameters.end(); it++) { 
93                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
94                                         abort = true;
95                                 }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100                         outputTypes["list"] = tempOutNames;
101                         outputTypes["rabund"] = tempOutNames;
102                         outputTypes["sabund"] = tempOutNames;
103                         outputTypes["column"] = tempOutNames;
104                         
105                         globaldata->newRead();
106                         
107                         //if the user changes the output directory command factory will send this info to us in the output parameter 
108                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
109                         
110                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("phylip");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
121                                 }
122                                 
123                                 it = parameters.find("column");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
129                                 }
130                                 
131                                 it = parameters.find("name");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
137                                 }
138                                 
139                                 it = parameters.find("taxonomy");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
145                                 }
146                                 
147                                 it = parameters.find("fasta");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
153                                 }
154                         }
155                         
156                         //check for required parameters
157                         phylipfile = validParameter.validFile(parameters, "phylip", true);
158                         if (phylipfile == "not open") { abort = true; }
159                         else if (phylipfile == "not found") { phylipfile = ""; }        
160                         else {  distfile = phylipfile;  format = "phylip";      }
161                         
162                         columnfile = validParameter.validFile(parameters, "column", true);
163                         if (columnfile == "not open") { abort = true; } 
164                         else if (columnfile == "not found") { columnfile = ""; }
165                         else {  distfile = columnfile; format = "column";       }
166                         
167                         namefile = validParameter.validFile(parameters, "name", true);
168                         if (namefile == "not open") { abort = true; }   
169                         else if (namefile == "not found") { namefile = ""; }
170                         
171                         fastafile = validParameter.validFile(parameters, "fasta", true);
172                         if (fastafile == "not open") { abort = true; }  
173                         else if (fastafile == "not found") { fastafile = ""; }
174                         else { distfile = fastafile;  splitmethod = "fasta";  }
175                         
176                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
177                         if (taxFile == "not open") { abort = true; }    
178                         else if (taxFile == "not found") { taxFile = ""; }
179                         
180                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
181                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
182                 
183                         if (columnfile != "") {
184                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
185                         }
186                         
187                         if (fastafile != "") {
188                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
189                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190                         }
191                                         
192                         //check for optional parameter and set defaults
193                         // ...at some point should added some additional type checking...
194                         //get user cutoff and precision or use defaults
195                         string temp;
196                         temp = validParameter.validFile(parameters, "precision", false);
197                         if (temp == "not found") { temp = "100"; }
198                         //saves precision legnth for formatting below
199                         length = temp.length();
200                         convert(temp, precision); 
201                         
202                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
203                         hard = m->isTrue(temp);
204                         
205                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
206                         large = m->isTrue(temp);
207                         
208                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
209                         convert(temp, processors); 
210                         
211                         temp = validParameter.validFile(parameters, "splitmethod", false);      
212                         if (splitmethod != "fasta") {
213                                 if (temp == "not found")  { splitmethod = "distance"; }
214                                 else {  splitmethod = temp; }
215                         }
216                         
217                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
218                         convert(temp, cutoff); 
219                         cutoff += (5 / (precision * 10.0));  
220                         
221                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
222                         convert(temp, taxLevelCutoff); 
223                         
224                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
225                         
226                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
227                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
228                         
229                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
230                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
231                         
232                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
233
234                         showabund = validParameter.validFile(parameters, "showabund", false);
235                         if (showabund == "not found") { showabund = "T"; }
236
237                         timing = validParameter.validFile(parameters, "timing", false);
238                         if (timing == "not found") { timing = "F"; }
239                         
240                 }
241         }
242         catch(exception& e) {
243                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
244                 exit(1);
245         }
246 }
247
248 //**********************************************************************************************************************
249
250 void ClusterSplitCommand::help(){
251         try {
252                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
253                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
254                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
255                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
256                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
257                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
258                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
259                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
260                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
261                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
262                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
263                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
264                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
265                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
266                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
267                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
268                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
269                 #ifdef USE_MPI
270                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
271                 #endif
272                 m->mothurOut("The cluster.split command should be in the following format: \n");
273                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
274                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
275
276         }
277         catch(exception& e) {
278                 m->errorOut(e, "ClusterSplitCommand", "help");
279                 exit(1);
280         }
281 }
282
283 //**********************************************************************************************************************
284
285 ClusterSplitCommand::~ClusterSplitCommand(){}
286
287 //**********************************************************************************************************************
288
289 int ClusterSplitCommand::execute(){
290         try {
291         
292                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
293                 
294                 time_t estart;
295                 vector<string> listFileNames;
296                 set<string> labels;
297                 string singletonName = "";
298                 double saveCutoff = cutoff;
299
300                 //****************** file prep work ******************************//
301                 #ifdef USE_MPI
302                         int pid;
303                         int tag = 2001;
304                         MPI_Status status; 
305                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
306                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
307                         
308                         if (pid == 0) { //only process 0 converts and splits
309                         
310                 #endif
311                 
312                 //if user gave a phylip file convert to column file
313                 if (format == "phylip") {
314                         estart = time(NULL);
315                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
316                         
317                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
318                         
319                         NameAssignment* nameMap = NULL;
320                         convert->setFormat("phylip");
321                         convert->read(nameMap);
322                         
323                         if (m->control_pressed) {  delete convert;  return 0;  }
324                         
325                         distfile = convert->getOutputFile();
326                 
327                         //if no names file given with phylip file, create it
328                         ListVector* listToMakeNameFile =  convert->getListVector();
329                         if (namefile == "") {  //you need to make a namefile for split matrix
330                                 ofstream out;
331                                 namefile = phylipfile + ".names";
332                                 m->openOutputFile(namefile, out);
333                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
334                                         string bin = listToMakeNameFile->get(i);
335                                         out << bin << '\t' << bin << endl;
336                                 }
337                                 out.close();
338                         }
339                         delete listToMakeNameFile;
340                         delete convert;
341                         
342                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
343                 }
344                 if (m->control_pressed) { return 0; }
345                 
346                 estart = time(NULL);
347                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
348                 
349                 //split matrix into non-overlapping groups
350                 SplitMatrix* split;
351                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
352                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
353                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
354                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
355                 
356                 split->split();
357                 
358                 if (m->control_pressed) { delete split; return 0; }
359                 
360                 singletonName = split->getSingletonNames();
361                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
362                 delete split;
363                 
364                 //output a merged distance file
365                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
366                         
367                                 
368                 if (m->control_pressed) { return 0; }
369                 
370                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
371                 estart = time(NULL);
372                 
373                 //****************** break up files between processes and cluster each file set ******************************//
374         #ifdef USE_MPI
375                         ////you are process 0 from above////
376                         
377                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
378                         dividedNames.resize(processors);
379                                         
380                         //for each file group figure out which process will complete it
381                         //want to divide the load intelligently so the big files are spread between processes
382                         for (int i = 0; i < distName.size(); i++) { 
383                                 int processToAssign = (i+1) % processors; 
384                                 if (processToAssign == 0) { processToAssign = processors; }
385                                                 
386                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
387                         }
388                                         
389                         //not lets reverse the order of ever other process, so we balance big files running with little ones
390                         for (int i = 0; i < processors; i++) {
391                                 int remainder = ((i+1) % processors);
392                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
393                         }
394                         
395                         
396                         //send each child the list of files it needs to process
397                         for(int i = 1; i < processors; i++) { 
398                                 //send number of file pairs
399                                 int num = dividedNames[i].size();
400                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
401                                 
402                                 for (int j = 0; j < num; j++) { //send filenames to process i
403                                         char tempDistFileName[1024];
404                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
405                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
406                                         
407                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
408                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
409                                         
410                                         char tempNameFileName[1024];
411                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
412                                         int lengthName = (dividedNames[i][j].begin()->second).length();
413
414                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
415                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
416                                 }
417                         }
418                         
419                         //process your share
420                         listFileNames = cluster(dividedNames[0], labels);
421                         
422                         //receive the other processes info
423                         for(int i = 1; i < processors; i++) { 
424                                 int num = dividedNames[i].size();
425                                 
426                                 double tempCutoff;
427                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
428                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
429                                 
430                                 //send list filenames to root process
431                                 for (int j = 0; j < num; j++) {  
432                                         int lengthList = 0;
433                                         char tempListFileName[1024];
434                                 
435                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
436                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
437                                 
438                                         string myListFileName = tempListFileName;
439                                         myListFileName = myListFileName.substr(0, lengthList);
440                                         
441                                         listFileNames.push_back(myListFileName);
442                                 }
443                                 
444                                 //send Labels to root process
445                                 int numLabels = 0;
446                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
447                                 
448                                 for (int j = 0; j < numLabels; j++) {  
449                                         int lengthLabel = 0;
450                                         char tempLabel[100];
451                                 
452                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
453                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
454                                 
455                                         string myLabel = tempLabel;
456                                         myLabel = myLabel.substr(0, lengthLabel);
457                         
458                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
459                                 }
460                         }
461                         
462                 }else { //you are a child process
463                         vector < map<string, string> >  myNames;
464                         
465                         //recieve the files you need to process
466                         //receive number of file pairs
467                         int num = 0;
468                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
469                         
470                         myNames.resize(num);
471         
472                         for (int j = 0; j < num; j++) { //receive filenames to process 
473                                 int lengthDist = 0;
474                                 char tempDistFileName[1024];
475                                 
476                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
477                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
478                                 
479                                 string myDistFileName = tempDistFileName;
480                                 myDistFileName = myDistFileName.substr(0, lengthDist);
481                         
482                                 int lengthName = 0;
483                                 char tempNameFileName[1024];
484                                 
485                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
486                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
487                                 
488                                 string myNameFileName = tempNameFileName;
489                                 myNameFileName = myNameFileName.substr(0, lengthName);
490                                 
491                                 //save file name
492                                 myNames[j][myDistFileName] = myNameFileName;
493                         }
494         
495                         //process them
496                         listFileNames = cluster(myNames, labels);
497                         
498                         //send cutoff
499                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
500                         
501                         //send list filenames to root process
502                         for (int j = 0; j < num; j++) {  
503                                 char tempListFileName[1024];
504                                 strcpy(tempListFileName, listFileNames[j].c_str());
505                                 int lengthList = listFileNames[j].length();
506                                         
507                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
508                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
509                         }
510                         
511                         //send Labels to root process
512                         int numLabels = labels.size();
513                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
514                         
515                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
516                                 char tempLabel[100];
517                                 strcpy(tempLabel, (*it).c_str());
518                                 int lengthLabel = (*it).length();
519                                         
520                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
521                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
522                         }
523                 }
524                 
525                 //make everyone wait
526                 MPI_Barrier(MPI_COMM_WORLD);
527                 
528         #else
529
530                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
531                                 if(processors == 1){
532                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
533                                 }else{
534                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
535                                         dividedNames.resize(processors);
536                                         
537                                         //for each file group figure out which process will complete it
538                                         //want to divide the load intelligently so the big files are spread between processes
539                                         for (int i = 0; i < distName.size(); i++) { 
540                                                 int processToAssign = (i+1) % processors; 
541                                                 if (processToAssign == 0) { processToAssign = processors; }
542                                                 
543                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
544                                         }
545                                         
546                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
547                                         for (int i = 0; i < processors; i++) {
548                                                 int remainder = ((i+1) % processors);
549                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
550                                         }
551                                         
552                                         createProcesses(dividedNames);
553                                                         
554                                         if (m->control_pressed) { return 0; }
555
556                                         //get list of list file names from each process
557                                         for(int i=0;i<processors;i++){
558                                                 string filename = toString(processIDS[i]) + ".temp";
559                                                 ifstream in;
560                                                 m->openInputFile(filename, in);
561                                                 
562                                                 in >> tag; m->gobble(in);
563                                                 
564                                                 while(!in.eof()) {
565                                                         string tempName;
566                                                         in >> tempName; m->gobble(in);
567                                                         listFileNames.push_back(tempName);
568                                                 }
569                                                 in.close();
570                                                 remove((toString(processIDS[i]) + ".temp").c_str());
571                                                 
572                                                 //get labels
573                                                 filename = toString(processIDS[i]) + ".temp.labels";
574                                                 ifstream in2;
575                                                 m->openInputFile(filename, in2);
576                                                 
577                                                 float tempCutoff;
578                                                 in2 >> tempCutoff; m->gobble(in2);
579                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
580                                                 
581                                                 while(!in2.eof()) {
582                                                         string tempName;
583                                                         in2 >> tempName; m->gobble(in2);
584                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
585                                                 }
586                                                 in2.close();
587                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
588                                         }
589                                 }
590                 #else
591                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
592                 #endif
593         #endif  
594                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
595                 
596                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
597                 
598                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
599                 
600                 //****************** merge list file and create rabund and sabund files ******************************//
601                 estart = time(NULL);
602                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
603                 
604                 #ifdef USE_MPI
605                         if (pid == 0) { //only process 0 merges
606                 #endif
607
608                 ListVector* listSingle;
609                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
610                 
611                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
612                 
613                 mergeLists(listFileNames, labelBins, listSingle);
614
615                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
616                 
617                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
618                 
619                 m->mothurOutEndLine();
620                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
621                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
622                 m->mothurOutEndLine();
623                 
624                 #ifdef USE_MPI
625                         } //only process 0 merges
626                         
627                         //make everyone wait
628                         MPI_Barrier(MPI_COMM_WORLD);
629                 #endif
630
631                 return 0;
632         }
633         catch(exception& e) {
634                 m->errorOut(e, "ClusterSplitCommand", "execute");
635                 exit(1);
636         }
637 }
638 //**********************************************************************************************************************
639 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
640         try {
641                                 
642                 map<float, int> labelBin;
643                 vector<float> orderFloat;
644                 int numSingleBins;
645                 
646                 //read in singletons
647                 if (singleton != "none") {
648                         ifstream in;
649                         m->openInputFile(singleton, in);
650                                 
651                         string firstCol, secondCol;
652                         listSingle = new ListVector();
653                         while (!in.eof()) {
654                                 in >> firstCol >> secondCol; m->gobble(in);
655                                 listSingle->push_back(secondCol);
656                         }
657                         in.close();
658                         remove(singleton.c_str());
659                         
660                         numSingleBins = listSingle->getNumBins();
661                 }else{  listSingle = NULL; numSingleBins = 0;  }
662                 
663                 //go through users set and make them floats so we can sort them 
664                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
665                         float temp = -10.0;
666
667                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
668                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
669                         
670                         if (temp <= cutoff) {
671                                 orderFloat.push_back(temp);
672                                 labelBin[temp] = numSingleBins; //initialize numbins 
673                         }
674                 }
675         
676                 //sort order
677                 sort(orderFloat.begin(), orderFloat.end());
678                 userLabels.clear();
679                         
680                 //get the list info from each file
681                 for (int k = 0; k < listNames.size(); k++) {
682         
683                         if (m->control_pressed) {  
684                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
685                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
686                                 return labelBin;
687                         }
688                         
689                         InputData* input = new InputData(listNames[k], "list");
690                         ListVector* list = input->getListVector();
691                         string lastLabel = list->getLabel();
692                         
693                         string filledInList = listNames[k] + "filledInTemp";
694                         ofstream outFilled;
695                         m->openOutputFile(filledInList, outFilled);
696         
697                         //for each label needed
698                         for(int l = 0; l < orderFloat.size(); l++){
699                         
700                                 string thisLabel;
701                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
702                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
703
704                                 //this file has reached the end
705                                 if (list == NULL) { 
706                                         list = input->getListVector(lastLabel, true); 
707                                 }else{  //do you have the distance, or do you need to fill in
708                                                 
709                                         float labelFloat;
710                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
711                                         else { convert(list->getLabel(), labelFloat); }
712
713                                         //check for missing labels
714                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
715                                                 //if its bigger get last label, otherwise keep it
716                                                 delete list;
717                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
718                                         }
719                                         lastLabel = list->getLabel();
720                                 }
721                                 
722                                 //print to new file
723                                 list->setLabel(thisLabel);
724                                 list->print(outFilled);
725                 
726                                 //update labelBin
727                                 labelBin[orderFloat[l]] += list->getNumBins();
728                                                                         
729                                 delete list;
730                                                                         
731                                 list = input->getListVector();
732                         }
733                         
734                         if (list != NULL) { delete list; }
735                         delete input;
736                         
737                         outFilled.close();
738                         remove(listNames[k].c_str());
739                         rename(filledInList.c_str(), listNames[k].c_str());
740                 }
741                 
742                 return labelBin;
743         }
744         catch(exception& e) {
745                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
746                 exit(1);
747         }
748 }
749 //**********************************************************************************************************************
750 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
751         try {
752                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
753                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
754                 
755                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
756                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
757                 m->openOutputFile(fileroot+ tag + ".list",              outList);
758                                 
759                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
760                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
761                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
762                 
763                 map<float, int>::iterator itLabel;
764
765                 //for each label needed
766                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
767                         
768                         string thisLabel;
769                         if (itLabel->first == -1) { thisLabel = "unique"; }
770                         else { thisLabel = toString(itLabel->first,  length-1);  } 
771                         
772                         outList << thisLabel << '\t' << itLabel->second << '\t';
773
774                         RAbundVector* rabund = new RAbundVector();
775                         rabund->setLabel(thisLabel);
776
777                         //add in singletons
778                         if (listSingle != NULL) {
779                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
780                                         outList << listSingle->get(j) << '\t';
781                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
782                                 }
783                         }
784                         
785                         //get the list info from each file
786                         for (int k = 0; k < listNames.size(); k++) {
787         
788                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
789                                 
790                                 InputData* input = new InputData(listNames[k], "list");
791                                 ListVector* list = input->getListVector(thisLabel);
792                                 
793                                 //this file has reached the end
794                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
795                                 else {          
796                                         for (int j = 0; j < list->getNumBins(); j++) {
797                                                 outList << list->get(j) << '\t';
798                                                 rabund->push_back(m->getNumNames(list->get(j)));
799                                         }
800                                         delete list;
801                                 }
802                                 delete input;
803                         }
804                         
805                         SAbundVector sabund = rabund->getSAbundVector();
806                         
807                         sabund.print(outSabund);
808                         rabund->print(outRabund);
809                         outList << endl;
810                         
811                         delete rabund;
812                 }
813                 
814                 outList.close();
815                 outRabund.close();
816                 outSabund.close();
817                 
818                 if (listSingle != NULL) { delete listSingle;  }
819                 
820                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
821                 
822                 return 0;
823         }
824         catch(exception& e) {
825                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
826                 exit(1);
827         }
828 }
829
830 //**********************************************************************************************************************
831
832 void ClusterSplitCommand::printData(ListVector* oldList){
833         try {
834                 string label = oldList->getLabel();
835                 RAbundVector oldRAbund = oldList->getRAbundVector();
836                 
837                 oldRAbund.setLabel(label);
838                 if (m->isTrue(showabund)) {
839                         oldRAbund.getSAbundVector().print(cout);
840                 }
841                 oldRAbund.print(outRabund);
842                 oldRAbund.getSAbundVector().print(outSabund);
843         
844                 oldList->print(outList);
845         }
846         catch(exception& e) {
847                 m->errorOut(e, "ClusterSplitCommand", "printData");
848                 exit(1);
849         }
850 }
851 //**********************************************************************************************************************
852 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
853         try {
854         
855         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
856                 int process = 0;
857                 int exitCommand = 1;
858                 processIDS.clear();
859                 
860                 //loop through and create all the processes you want
861                 while (process != processors) {
862                         int pid = fork();
863                         
864                         if (pid > 0) {
865                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
866                                 process++;
867                         }else if (pid == 0){
868                                 set<string> labels;
869                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
870                                 
871                                 //write out names to file
872                                 string filename = toString(getpid()) + ".temp";
873                                 ofstream out;
874                                 m->openOutputFile(filename, out);
875                                 out << tag << endl;
876                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
877                                 out.close();
878                                 
879                                 //print out labels
880                                 ofstream outLabels;
881                                 filename = toString(getpid()) + ".temp.labels";
882                                 m->openOutputFile(filename, outLabels);
883                                 
884                                 outLabels << cutoff << endl;
885                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
886                                         outLabels << (*it) << endl;
887                                 }
888                                 outLabels.close();
889
890                                 exit(0);
891                         }else { 
892                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
893                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
894                                 exit(0);
895                         }
896                 }
897                 
898                 //force parent to wait until all the processes are done
899                 for (int i=0;i<processors;i++) { 
900                         int temp = processIDS[i];
901                         wait(&temp);
902                 }
903                 
904                 return exitCommand;
905         #endif          
906         
907         }
908         catch(exception& e) {
909                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
910                 exit(1);
911         }
912 }
913 //**********************************************************************************************************************
914
915 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
916         try {
917                 Cluster* cluster;
918                 SparseMatrix* matrix;
919                 ListVector* list;
920                 ListVector oldList;
921                 RAbundVector* rabund;
922                 
923                 vector<string> listFileNames;
924                 
925                 double smallestCutoff = cutoff;
926                 
927                 //cluster each distance file
928                 for (int i = 0; i < distNames.size(); i++) {
929                         if (m->control_pressed) { return listFileNames; }
930                         
931                         string thisNamefile = distNames[i].begin()->second;
932                         string thisDistFile = distNames[i].begin()->first;
933                         
934                         //read in distance file
935                         globaldata->setNameFile(thisNamefile);
936                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
937                         
938                         #ifdef USE_MPI
939                                 int pid;
940                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
941                                 
942                                 //output your files too
943                                 if (pid != 0) {
944                                         cout << endl << "Reading " << thisDistFile << endl;
945                                 }
946                         #endif
947                         
948                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
949                         
950                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
951                         read->setCutoff(cutoff);
952
953                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
954                         nameMap->readMap();
955                         read->read(nameMap);
956                         
957                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
958                         
959                         list = read->getListVector();
960                         oldList = *list;
961                         matrix = read->getMatrix();
962                         
963                         delete read; 
964                         delete nameMap; 
965                         
966                         
967                         #ifdef USE_MPI
968                                 //output your files too
969                                 if (pid != 0) {
970                                         cout << endl << "Clustering " << thisDistFile << endl;
971                                 }
972                         #endif
973                         
974                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
975                 
976                         rabund = new RAbundVector(list->getRAbundVector());
977                         
978                         //create cluster
979                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
980                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
981                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
982                         tag = cluster->getTag();
983                 
984                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
985                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
986                         
987                         ofstream listFile;
988                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
989                 
990                         listFileNames.push_back(fileroot+ tag + ".list");
991                                 
992                         float previousDist = 0.00000;
993                         float rndPreviousDist = 0.00000;
994                         
995                         oldList = *list;
996
997                         print_start = true;
998                         start = time(NULL);
999                         double saveCutoff = cutoff;
1000                 
1001                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1002                 
1003                                 if (m->control_pressed) { //clean up
1004                                         delete matrix; delete list;     delete cluster; delete rabund;
1005                                         listFile.close();
1006                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1007                                         listFileNames.clear(); return listFileNames;
1008                                 }
1009                 
1010                                 cluster->update(saveCutoff);
1011         
1012                                 float dist = matrix->getSmallDist();
1013                                 float rndDist;
1014                                 if (hard) {
1015                                         rndDist = m->ceilDist(dist, precision); 
1016                                 }else{
1017                                         rndDist = m->roundDist(dist, precision); 
1018                                 }
1019
1020                                 if(previousDist <= 0.0000 && dist != previousDist){
1021                                         oldList.setLabel("unique");
1022                                         oldList.print(listFile);
1023                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1024                                 }
1025                                 else if(rndDist != rndPreviousDist){
1026                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1027                                         oldList.print(listFile);
1028                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1029                                 }
1030                 
1031                                 previousDist = dist;
1032                                 rndPreviousDist = rndDist;
1033                                 oldList = *list;
1034                         }
1035
1036                 
1037                         if(previousDist <= 0.0000){
1038                                 oldList.setLabel("unique");
1039                                 oldList.print(listFile);
1040                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1041                         }
1042                         else if(rndPreviousDist<cutoff){
1043                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1044                                 oldList.print(listFile);
1045                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1046                         }
1047         
1048                         delete matrix; delete list;     delete cluster; delete rabund; 
1049                         listFile.close();
1050                         
1051                         if (m->control_pressed) { //clean up
1052                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1053                                 listFileNames.clear(); return listFileNames;
1054                         }
1055                         
1056                         remove(thisDistFile.c_str());
1057                         remove(thisNamefile.c_str());
1058                         
1059                         if (saveCutoff != cutoff) { 
1060                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1061                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1062                         
1063                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1064                         }
1065                         
1066                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1067                 }
1068                 
1069                 cutoff = smallestCutoff;
1070                                         
1071                 return listFileNames;
1072         
1073         }
1074         catch(exception& e) {
1075                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1076                 exit(1);
1077         }
1078
1079
1080 }
1081 //**********************************************************************************************************************
1082
1083 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1084         try{
1085                 
1086 #ifdef USE_MPI
1087                 int pid;
1088                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1089                 
1090                 if (pid != 0) {
1091 #endif
1092                 
1093                 string thisOutputDir = outputDir;
1094                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1095                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1096                 remove(outputFileName.c_str());
1097                 
1098                 
1099                 for (int i = 0; i < distNames.size(); i++) {
1100                         if (m->control_pressed) {  return 0; }
1101                         
1102                         string thisDistFile = distNames[i].begin()->first;
1103                         
1104                         m->appendFiles(thisDistFile, outputFileName);
1105                 }       
1106                         
1107                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1108                         
1109 #ifdef USE_MPI
1110                 }
1111 #endif
1112                                 
1113                 return 0;       
1114                 
1115                 
1116         }
1117         catch(exception& e) {
1118                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1119                 exit(1);
1120         }
1121 }
1122 //**********************************************************************************************************************