]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
removed various build warnings
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){       
21         try {
22                 string AlignArray[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
24                 return myArray;
25         }
26         catch(exception& e) {
27                 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
28                 exit(1);
29         }
30 }
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){     
33         try {
34                 abort = true;
35                 //initialize outputTypes
36                 vector<string> tempOutNames;
37                 outputTypes["list"] = tempOutNames;
38                 outputTypes["rabund"] = tempOutNames;
39                 outputTypes["sabund"] = tempOutNames;
40                 outputTypes["column"] = tempOutNames;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
44                 exit(1);
45         }
46 }
47 //**********************************************************************************************************************
48 vector<string> ClusterSplitCommand::getRequiredParameters(){    
49         try {
50                 string Array[] =  {"fasta","phylip","column","or"};
51                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
52                 return myArray;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 vector<string> ClusterSplitCommand::getRequiredFiles(){ 
61         try {
62                 vector<string> myArray;
63                 return myArray;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
72 ClusterSplitCommand::ClusterSplitCommand(string option)  {
73         try{
74                 globaldata = GlobalData::getInstance();
75                 abort = false;
76                 format = "";
77                 
78                 //allow user to run help
79                 if(option == "help") { help(); abort = true; }
80                 
81                 else {
82                         //valid paramters for this command
83                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
84                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter("cluster.split");
90                 
91                         //check to make sure all parameters are valid for command
92                         map<string,string>::iterator it;
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
95                                         abort = true;
96                                 }
97                         }
98                         
99                         //initialize outputTypes
100                         vector<string> tempOutNames;
101                         outputTypes["list"] = tempOutNames;
102                         outputTypes["rabund"] = tempOutNames;
103                         outputTypes["sabund"] = tempOutNames;
104                         outputTypes["column"] = tempOutNames;
105                         
106                         globaldata->newRead();
107                         
108                         //if the user changes the output directory command factory will send this info to us in the output parameter 
109                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
110                         
111                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
112                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
113                         if (inputDir == "not found"){   inputDir = "";          }
114                         else {
115                                 string path;
116                                 it = parameters.find("phylip");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
122                                 }
123                                 
124                                 it = parameters.find("column");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
130                                 }
131                                 
132                                 it = parameters.find("name");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
138                                 }
139                                 
140                                 it = parameters.find("taxonomy");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
146                                 }
147                                 
148                                 it = parameters.find("fasta");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
154                                 }
155                         }
156                         
157                         //check for required parameters
158                         phylipfile = validParameter.validFile(parameters, "phylip", true);
159                         if (phylipfile == "not open") { abort = true; }
160                         else if (phylipfile == "not found") { phylipfile = ""; }        
161                         else {  distfile = phylipfile;  format = "phylip";      }
162                         
163                         columnfile = validParameter.validFile(parameters, "column", true);
164                         if (columnfile == "not open") { abort = true; } 
165                         else if (columnfile == "not found") { columnfile = ""; }
166                         else {  distfile = columnfile; format = "column";       }
167                         
168                         namefile = validParameter.validFile(parameters, "name", true);
169                         if (namefile == "not open") { abort = true; }   
170                         else if (namefile == "not found") { namefile = ""; }
171                         
172                         fastafile = validParameter.validFile(parameters, "fasta", true);
173                         if (fastafile == "not open") { abort = true; }  
174                         else if (fastafile == "not found") { fastafile = ""; }
175                         else { distfile = fastafile;  splitmethod = "fasta";  }
176                         
177                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
178                         if (taxFile == "not open") { abort = true; }    
179                         else if (taxFile == "not found") { taxFile = ""; }
180                         
181                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
182                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
183                 
184                         if (columnfile != "") {
185                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
186                         }
187                         
188                         if (fastafile != "") {
189                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
191                         }
192                                         
193                         //check for optional parameter and set defaults
194                         // ...at some point should added some additional type checking...
195                         //get user cutoff and precision or use defaults
196                         string temp;
197                         temp = validParameter.validFile(parameters, "precision", false);
198                         if (temp == "not found") { temp = "100"; }
199                         //saves precision legnth for formatting below
200                         length = temp.length();
201                         convert(temp, precision); 
202                         
203                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
204                         hard = m->isTrue(temp);
205                         
206                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
207                         large = m->isTrue(temp);
208                         
209                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
210                         convert(temp, processors); 
211                         
212                         temp = validParameter.validFile(parameters, "splitmethod", false);      
213                         if (splitmethod != "fasta") {
214                                 if (temp == "not found")  { splitmethod = "distance"; }
215                                 else {  splitmethod = temp; }
216                         }
217                         
218                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
219                         convert(temp, cutoff); 
220                         cutoff += (5 / (precision * 10.0));  
221                         
222                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
223                         convert(temp, taxLevelCutoff); 
224                         
225                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
226                         
227                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
228                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
229                         
230                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
231                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
232                         
233                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
234
235                         showabund = validParameter.validFile(parameters, "showabund", false);
236                         if (showabund == "not found") { showabund = "T"; }
237
238                         timing = validParameter.validFile(parameters, "timing", false);
239                         if (timing == "not found") { timing = "F"; }
240                         
241                 }
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
245                 exit(1);
246         }
247 }
248
249 //**********************************************************************************************************************
250
251 void ClusterSplitCommand::help(){
252         try {
253                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
254                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
255                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
256                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
257                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
258                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
259                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
260                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
261                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
262                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
263                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
264                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
265                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
266                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
267                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
268                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
269                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
270                 #ifdef USE_MPI
271                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
272                 #endif
273                 m->mothurOut("The cluster.split command should be in the following format: \n");
274                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
275                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
276
277         }
278         catch(exception& e) {
279                 m->errorOut(e, "ClusterSplitCommand", "help");
280                 exit(1);
281         }
282 }
283
284 //**********************************************************************************************************************
285
286 ClusterSplitCommand::~ClusterSplitCommand(){}
287
288 //**********************************************************************************************************************
289
290 int ClusterSplitCommand::execute(){
291         try {
292         
293                 if (abort == true) {    return 0;       }
294                 
295                 time_t estart;
296                 vector<string> listFileNames;
297                 set<string> labels;
298                 string singletonName = "";
299                 double saveCutoff = cutoff;
300
301                 //****************** file prep work ******************************//
302                 #ifdef USE_MPI
303                         int pid;
304                         int tag = 2001;
305                         MPI_Status status; 
306                         MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
307                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
308                         
309                         if (pid == 0) { //only process 0 converts and splits
310                         
311                 #endif
312                 
313                 //if user gave a phylip file convert to column file
314                 if (format == "phylip") {
315                         estart = time(NULL);
316                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
317                         
318                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
319                         
320                         NameAssignment* nameMap = NULL;
321                         convert->setFormat("phylip");
322                         convert->read(nameMap);
323                         
324                         if (m->control_pressed) {  delete convert;  return 0;  }
325                         
326                         distfile = convert->getOutputFile();
327                 
328                         //if no names file given with phylip file, create it
329                         ListVector* listToMakeNameFile =  convert->getListVector();
330                         if (namefile == "") {  //you need to make a namefile for split matrix
331                                 ofstream out;
332                                 namefile = phylipfile + ".names";
333                                 m->openOutputFile(namefile, out);
334                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
335                                         string bin = listToMakeNameFile->get(i);
336                                         out << bin << '\t' << bin << endl;
337                                 }
338                                 out.close();
339                         }
340                         delete listToMakeNameFile;
341                         delete convert;
342                         
343                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
344                 }
345                 if (m->control_pressed) { return 0; }
346                 
347                 estart = time(NULL);
348                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
349                 
350                 //split matrix into non-overlapping groups
351                 SplitMatrix* split;
352                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
353                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
354                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir);      }
355                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
356                 
357                 split->split();
358                 
359                 if (m->control_pressed) { delete split; return 0; }
360                 
361                 singletonName = split->getSingletonNames();
362                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
363                 delete split;
364                 
365                 //output a merged distance file
366                 if (splitmethod == "fasta")             { createMergedDistanceFile(distName); }
367                         
368                                 
369                 if (m->control_pressed) { return 0; }
370                 
371                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
372                 estart = time(NULL);
373                 
374                 //****************** break up files between processes and cluster each file set ******************************//
375         #ifdef USE_MPI
376                         ////you are process 0 from above////
377                         
378                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...                           
379                         dividedNames.resize(processors);
380                                         
381                         //for each file group figure out which process will complete it
382                         //want to divide the load intelligently so the big files are spread between processes
383                         int count = 1;
384                         for (int i = 0; i < distName.size(); i++) { 
385                                 int processToAssign = (i+1) % processors; 
386                                 if (processToAssign == 0) { processToAssign = processors; }
387                                                 
388                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
389                         }
390                                         
391                         //not lets reverse the order of ever other process, so we balance big files running with little ones
392                         for (int i = 0; i < processors; i++) {
393                                 int remainder = ((i+1) % processors);
394                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
395                         }
396                         
397                         
398                         //send each child the list of files it needs to process
399                         for(int i = 1; i < processors; i++) { 
400                                 //send number of file pairs
401                                 int num = dividedNames[i].size();
402                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403                                 
404                                 for (int j = 0; j < num; j++) { //send filenames to process i
405                                         char tempDistFileName[1024];
406                                         strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
407                                         int lengthDist = (dividedNames[i][j].begin()->first).length();
408                                         
409                                         MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
410                                         MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
411                                         
412                                         char tempNameFileName[1024];
413                                         strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
414                                         int lengthName = (dividedNames[i][j].begin()->second).length();
415
416                                         MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
417                                         MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
418                                 }
419                         }
420                         
421                         //process your share
422                         listFileNames = cluster(dividedNames[0], labels);
423                         
424                         //receive the other processes info
425                         for(int i = 1; i < processors; i++) { 
426                                 int num = dividedNames[i].size();
427                                 
428                                 double tempCutoff;
429                                 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
430                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
431                                 
432                                 //send list filenames to root process
433                                 for (int j = 0; j < num; j++) {  
434                                         int lengthList = 0;
435                                         char tempListFileName[1024];
436                                 
437                                         MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
438                                         MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
439                                 
440                                         string myListFileName = tempListFileName;
441                                         myListFileName = myListFileName.substr(0, lengthList);
442                                         
443                                         listFileNames.push_back(myListFileName);
444                                 }
445                                 
446                                 //send Labels to root process
447                                 int numLabels = 0;
448                                 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
449                                 
450                                 for (int j = 0; j < numLabels; j++) {  
451                                         int lengthLabel = 0;
452                                         char tempLabel[100];
453                                 
454                                         MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
455                                         MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
456                                 
457                                         string myLabel = tempLabel;
458                                         myLabel = myLabel.substr(0, lengthLabel);
459                         
460                                         if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
461                                 }
462                         }
463                         
464                 }else { //you are a child process
465                         vector < map<string, string> >  myNames;
466                         
467                         //recieve the files you need to process
468                         //receive number of file pairs
469                         int num = 0;
470                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
471                         
472                         myNames.resize(num);
473         
474                         for (int j = 0; j < num; j++) { //receive filenames to process 
475                                 int lengthDist = 0;
476                                 char tempDistFileName[1024];
477                                 
478                                 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
479                                 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
480                                 
481                                 string myDistFileName = tempDistFileName;
482                                 myDistFileName = myDistFileName.substr(0, lengthDist);
483                         
484                                 int lengthName = 0;
485                                 char tempNameFileName[1024];
486                                 
487                                 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
488                                 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status); 
489                                 
490                                 string myNameFileName = tempNameFileName;
491                                 myNameFileName = myNameFileName.substr(0, lengthName);
492                                 
493                                 //save file name
494                                 myNames[j][myDistFileName] = myNameFileName;
495                         }
496         
497                         //process them
498                         listFileNames = cluster(myNames, labels);
499                         
500                         //send cutoff
501                         MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
502                         
503                         //send list filenames to root process
504                         for (int j = 0; j < num; j++) {  
505                                 char tempListFileName[1024];
506                                 strcpy(tempListFileName, listFileNames[j].c_str());
507                                 int lengthList = listFileNames[j].length();
508                                         
509                                 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
510                                 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
511                         }
512                         
513                         //send Labels to root process
514                         int numLabels = labels.size();
515                         MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
516                         
517                         for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
518                                 char tempLabel[100];
519                                 strcpy(tempLabel, (*it).c_str());
520                                 int lengthLabel = (*it).length();
521                                         
522                                 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
523                                 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
524                         }
525                 }
526                 
527                 //make everyone wait
528                 MPI_Barrier(MPI_COMM_WORLD);
529                 
530         #else
531
532                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
533                                 if(processors == 1){
534                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
535                                 }else{
536                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
537                                         dividedNames.resize(processors);
538                                         
539                                         //for each file group figure out which process will complete it
540                                         //want to divide the load intelligently so the big files are spread between processes
541                                         for (int i = 0; i < distName.size(); i++) { 
542                                                 int processToAssign = (i+1) % processors; 
543                                                 if (processToAssign == 0) { processToAssign = processors; }
544                                                 
545                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
546                                         }
547                                         
548                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
549                                         for (int i = 0; i < processors; i++) {
550                                                 int remainder = ((i+1) % processors);
551                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
552                                         }
553                                         
554                                         createProcesses(dividedNames);
555                                                         
556                                         if (m->control_pressed) { return 0; }
557
558                                         //get list of list file names from each process
559                                         for(int i=0;i<processors;i++){
560                                                 string filename = toString(processIDS[i]) + ".temp";
561                                                 ifstream in;
562                                                 m->openInputFile(filename, in);
563                                                 
564                                                 in >> tag; m->gobble(in);
565                                                 
566                                                 while(!in.eof()) {
567                                                         string tempName;
568                                                         in >> tempName; m->gobble(in);
569                                                         listFileNames.push_back(tempName);
570                                                 }
571                                                 in.close();
572                                                 remove((toString(processIDS[i]) + ".temp").c_str());
573                                                 
574                                                 //get labels
575                                                 filename = toString(processIDS[i]) + ".temp.labels";
576                                                 ifstream in2;
577                                                 m->openInputFile(filename, in2);
578                                                 
579                                                 float tempCutoff;
580                                                 in2 >> tempCutoff; m->gobble(in2);
581                                                 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
582                                                 
583                                                 while(!in2.eof()) {
584                                                         string tempName;
585                                                         in2 >> tempName; m->gobble(in2);
586                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
587                                                 }
588                                                 in2.close();
589                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
590                                         }
591                                 }
592                 #else
593                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
594                 #endif
595         #endif  
596                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
597                 
598                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
599                 
600                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
601                 
602                 //****************** merge list file and create rabund and sabund files ******************************//
603                 estart = time(NULL);
604                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
605                 
606                 #ifdef USE_MPI
607                         if (pid == 0) { //only process 0 merges
608                 #endif
609
610                 ListVector* listSingle;
611                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
612                 
613                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
614                 
615                 mergeLists(listFileNames, labelBins, listSingle);
616
617                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
618                 
619                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
620                 
621                 m->mothurOutEndLine();
622                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
623                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
624                 m->mothurOutEndLine();
625                 
626                 #ifdef USE_MPI
627                         } //only process 0 merges
628                         
629                         //make everyone wait
630                         MPI_Barrier(MPI_COMM_WORLD);
631                 #endif
632
633                 return 0;
634         }
635         catch(exception& e) {
636                 m->errorOut(e, "ClusterSplitCommand", "execute");
637                 exit(1);
638         }
639 }
640 //**********************************************************************************************************************
641 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
642         try {
643                                 
644                 map<float, int> labelBin;
645                 vector<float> orderFloat;
646                 int numSingleBins;
647                 
648                 //read in singletons
649                 if (singleton != "none") {
650                         ifstream in;
651                         m->openInputFile(singleton, in);
652                                 
653                         string firstCol, secondCol;
654                         listSingle = new ListVector();
655                         while (!in.eof()) {
656                                 in >> firstCol >> secondCol; m->gobble(in);
657                                 listSingle->push_back(secondCol);
658                         }
659                         in.close();
660                         remove(singleton.c_str());
661                         
662                         numSingleBins = listSingle->getNumBins();
663                 }else{  listSingle = NULL; numSingleBins = 0;  }
664                 
665                 //go through users set and make them floats so we can sort them 
666                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
667                         float temp = -10.0;
668
669                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
670                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
671                         
672                         if (temp <= cutoff) {
673                                 orderFloat.push_back(temp);
674                                 labelBin[temp] = numSingleBins; //initialize numbins 
675                         }
676                 }
677         
678                 //sort order
679                 sort(orderFloat.begin(), orderFloat.end());
680                 userLabels.clear();
681                         
682                 //get the list info from each file
683                 for (int k = 0; k < listNames.size(); k++) {
684         
685                         if (m->control_pressed) {  
686                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
687                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
688                                 return labelBin;
689                         }
690                         
691                         InputData* input = new InputData(listNames[k], "list");
692                         ListVector* list = input->getListVector();
693                         string lastLabel = list->getLabel();
694                         
695                         string filledInList = listNames[k] + "filledInTemp";
696                         ofstream outFilled;
697                         m->openOutputFile(filledInList, outFilled);
698         
699                         //for each label needed
700                         for(int l = 0; l < orderFloat.size(); l++){
701                         
702                                 string thisLabel;
703                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
704                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
705
706                                 //this file has reached the end
707                                 if (list == NULL) { 
708                                         list = input->getListVector(lastLabel, true); 
709                                 }else{  //do you have the distance, or do you need to fill in
710                                                 
711                                         float labelFloat;
712                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
713                                         else { convert(list->getLabel(), labelFloat); }
714
715                                         //check for missing labels
716                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
717                                                 //if its bigger get last label, otherwise keep it
718                                                 delete list;
719                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
720                                         }
721                                         lastLabel = list->getLabel();
722                                 }
723                                 
724                                 //print to new file
725                                 list->setLabel(thisLabel);
726                                 list->print(outFilled);
727                 
728                                 //update labelBin
729                                 labelBin[orderFloat[l]] += list->getNumBins();
730                                                                         
731                                 delete list;
732                                                                         
733                                 list = input->getListVector();
734                         }
735                         
736                         if (list != NULL) { delete list; }
737                         delete input;
738                         
739                         outFilled.close();
740                         remove(listNames[k].c_str());
741                         rename(filledInList.c_str(), listNames[k].c_str());
742                 }
743                 
744                 return labelBin;
745         }
746         catch(exception& e) {
747                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
748                 exit(1);
749         }
750 }
751 //**********************************************************************************************************************
752 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
753         try {
754                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
755                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
756                 
757                 m->openOutputFile(fileroot+ tag + ".sabund",    outSabund);
758                 m->openOutputFile(fileroot+ tag + ".rabund",    outRabund);
759                 m->openOutputFile(fileroot+ tag + ".list",              outList);
760                                 
761                 outputNames.push_back(fileroot+ tag + ".sabund");  outputTypes["list"].push_back(fileroot+ tag + ".list");
762                 outputNames.push_back(fileroot+ tag + ".rabund");  outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
763                 outputNames.push_back(fileroot+ tag + ".list");    outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
764                 
765                 map<float, int>::iterator itLabel;
766
767                 //for each label needed
768                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
769                         
770                         string thisLabel;
771                         if (itLabel->first == -1) { thisLabel = "unique"; }
772                         else { thisLabel = toString(itLabel->first,  length-1);  } 
773                         
774                         outList << thisLabel << '\t' << itLabel->second << '\t';
775
776                         RAbundVector* rabund = new RAbundVector();
777                         rabund->setLabel(thisLabel);
778
779                         //add in singletons
780                         if (listSingle != NULL) {
781                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
782                                         outList << listSingle->get(j) << '\t';
783                                         rabund->push_back(m->getNumNames(listSingle->get(j)));
784                                 }
785                         }
786                         
787                         //get the list info from each file
788                         for (int k = 0; k < listNames.size(); k++) {
789         
790                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
791                                 
792                                 InputData* input = new InputData(listNames[k], "list");
793                                 ListVector* list = input->getListVector(thisLabel);
794                                 
795                                 //this file has reached the end
796                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
797                                 else {          
798                                         for (int j = 0; j < list->getNumBins(); j++) {
799                                                 outList << list->get(j) << '\t';
800                                                 rabund->push_back(m->getNumNames(list->get(j)));
801                                         }
802                                         delete list;
803                                 }
804                                 delete input;
805                         }
806                         
807                         SAbundVector sabund = rabund->getSAbundVector();
808                         
809                         sabund.print(outSabund);
810                         rabund->print(outRabund);
811                         outList << endl;
812                         
813                         delete rabund;
814                 }
815                 
816                 outList.close();
817                 outRabund.close();
818                 outSabund.close();
819                 
820                 if (listSingle != NULL) { delete listSingle;  }
821                 
822                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
823                 
824                 return 0;
825         }
826         catch(exception& e) {
827                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
828                 exit(1);
829         }
830 }
831
832 //**********************************************************************************************************************
833
834 void ClusterSplitCommand::printData(ListVector* oldList){
835         try {
836                 string label = oldList->getLabel();
837                 RAbundVector oldRAbund = oldList->getRAbundVector();
838                 
839                 oldRAbund.setLabel(label);
840                 if (m->isTrue(showabund)) {
841                         oldRAbund.getSAbundVector().print(cout);
842                 }
843                 oldRAbund.print(outRabund);
844                 oldRAbund.getSAbundVector().print(outSabund);
845         
846                 oldList->print(outList);
847         }
848         catch(exception& e) {
849                 m->errorOut(e, "ClusterSplitCommand", "printData");
850                 exit(1);
851         }
852 }
853 //**********************************************************************************************************************
854 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
855         try {
856         
857         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
858                 int process = 0;
859                 int exitCommand = 1;
860                 processIDS.clear();
861                 
862                 //loop through and create all the processes you want
863                 while (process != processors) {
864                         int pid = fork();
865                         
866                         if (pid > 0) {
867                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
868                                 process++;
869                         }else if (pid == 0){
870                                 set<string> labels;
871                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
872                                 
873                                 //write out names to file
874                                 string filename = toString(getpid()) + ".temp";
875                                 ofstream out;
876                                 m->openOutputFile(filename, out);
877                                 out << tag << endl;
878                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
879                                 out.close();
880                                 
881                                 //print out labels
882                                 ofstream outLabels;
883                                 filename = toString(getpid()) + ".temp.labels";
884                                 m->openOutputFile(filename, outLabels);
885                                 
886                                 outLabels << cutoff << endl;
887                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
888                                         outLabels << (*it) << endl;
889                                 }
890                                 outLabels.close();
891
892                                 exit(0);
893                         }else { 
894                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
895                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
896                                 exit(0);
897                         }
898                 }
899                 
900                 //force parent to wait until all the processes are done
901                 for (int i=0;i<processors;i++) { 
902                         int temp = processIDS[i];
903                         wait(&temp);
904                 }
905                 
906                 return exitCommand;
907         #endif          
908         
909         }
910         catch(exception& e) {
911                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
912                 exit(1);
913         }
914 }
915 //**********************************************************************************************************************
916
917 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
918         try {
919                 Cluster* cluster;
920                 SparseMatrix* matrix;
921                 ListVector* list;
922                 ListVector oldList;
923                 RAbundVector* rabund;
924                 
925                 vector<string> listFileNames;
926                 
927                 double smallestCutoff = cutoff;
928                 
929                 //cluster each distance file
930                 for (int i = 0; i < distNames.size(); i++) {
931                         if (m->control_pressed) { return listFileNames; }
932                         
933                         string thisNamefile = distNames[i].begin()->second;
934                         string thisDistFile = distNames[i].begin()->first;
935                         
936                         //read in distance file
937                         globaldata->setNameFile(thisNamefile);
938                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
939                         
940                         #ifdef USE_MPI
941                                 int pid;
942                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
943                                 
944                                 //output your files too
945                                 if (pid != 0) {
946                                         cout << endl << "Reading " << thisDistFile << endl;
947                                 }
948                         #endif
949                         
950                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
951                         
952                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
953                         read->setCutoff(cutoff);
954
955                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
956                         nameMap->readMap();
957                         read->read(nameMap);
958                         
959                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
960                         
961                         list = read->getListVector();
962                         oldList = *list;
963                         matrix = read->getMatrix();
964                         
965                         delete read; 
966                         delete nameMap; 
967                         
968                         
969                         #ifdef USE_MPI
970                                 //output your files too
971                                 if (pid != 0) {
972                                         cout << endl << "Clustering " << thisDistFile << endl;
973                                 }
974                         #endif
975                         
976                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
977                 
978                         rabund = new RAbundVector(list->getRAbundVector());
979                         
980                         //create cluster
981                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
982                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
983                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
984                         tag = cluster->getTag();
985                 
986                         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
987                         fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
988                         
989                         ofstream listFile;
990                         m->openOutputFile(fileroot+ tag + ".list",      listFile);
991                 
992                         listFileNames.push_back(fileroot+ tag + ".list");
993                                 
994                         float previousDist = 0.00000;
995                         float rndPreviousDist = 0.00000;
996                         
997                         oldList = *list;
998
999                         print_start = true;
1000                         start = time(NULL);
1001                         double saveCutoff = cutoff;
1002                 
1003                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1004                 
1005                                 if (m->control_pressed) { //clean up
1006                                         delete matrix; delete list;     delete cluster; delete rabund;
1007                                         listFile.close();
1008                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1009                                         listFileNames.clear(); return listFileNames;
1010                                 }
1011                 
1012                                 cluster->update(saveCutoff);
1013         
1014                                 float dist = matrix->getSmallDist();
1015                                 float rndDist;
1016                                 if (hard) {
1017                                         rndDist = m->ceilDist(dist, precision); 
1018                                 }else{
1019                                         rndDist = m->roundDist(dist, precision); 
1020                                 }
1021
1022                                 if(previousDist <= 0.0000 && dist != previousDist){
1023                                         oldList.setLabel("unique");
1024                                         oldList.print(listFile);
1025                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
1026                                 }
1027                                 else if(rndDist != rndPreviousDist){
1028                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
1029                                         oldList.print(listFile);
1030                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1031                                 }
1032                 
1033                                 previousDist = dist;
1034                                 rndPreviousDist = rndDist;
1035                                 oldList = *list;
1036                         }
1037
1038                 
1039                         if(previousDist <= 0.0000){
1040                                 oldList.setLabel("unique");
1041                                 oldList.print(listFile);
1042                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
1043                         }
1044                         else if(rndPreviousDist<cutoff){
1045                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
1046                                 oldList.print(listFile);
1047                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
1048                         }
1049         
1050                         delete matrix; delete list;     delete cluster; delete rabund; 
1051                         listFile.close();
1052                         
1053                         if (m->control_pressed) { //clean up
1054                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
1055                                 listFileNames.clear(); return listFileNames;
1056                         }
1057                         
1058                         remove(thisDistFile.c_str());
1059                         remove(thisNamefile.c_str());
1060                         
1061                         if (saveCutoff != cutoff) { 
1062                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
1063                                 else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
1064                         
1065                                 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();  
1066                         }
1067                         
1068                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1069                 }
1070                 
1071                 cutoff = smallestCutoff;
1072                                         
1073                 return listFileNames;
1074         
1075         }
1076         catch(exception& e) {
1077                 m->errorOut(e, "ClusterSplitCommand", "cluster");
1078                 exit(1);
1079         }
1080
1081
1082 }
1083 //**********************************************************************************************************************
1084
1085 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1086         try{
1087                 
1088 #ifdef USE_MPI
1089                 int pid;
1090                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1091                 
1092                 if (pid != 0) {
1093 #endif
1094                 
1095                 string thisOutputDir = outputDir;
1096                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1097                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1098                 remove(outputFileName.c_str());
1099                 
1100                 
1101                 for (int i = 0; i < distNames.size(); i++) {
1102                         if (m->control_pressed) {  return 0; }
1103                         
1104                         string thisDistFile = distNames[i].begin()->first;
1105                         
1106                         m->appendFiles(thisDistFile, outputFileName);
1107                 }       
1108                         
1109                 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1110                         
1111 #ifdef USE_MPI
1112                 }
1113 #endif
1114                                 
1115                 return 0;       
1116                 
1117                 
1118         }
1119         catch(exception& e) {
1120                 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1121                 exit(1);
1122         }
1123 }
1124 //**********************************************************************************************************************