]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
fixed bug in cluster.split with classify method
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
21         try{
22                 globaldata = GlobalData::getInstance();
23                 abort = false;
24                 format = "";
25                 
26                 //allow user to run help
27                 if(option == "help") { help(); abort = true; }
28                 
29                 else {
30                         //valid paramters for this command
31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33                         
34                         OptionParser parser(option);
35                         map<string,string> parameters = parser.getParameters();
36                         
37                         ValidParameters validParameter;
38                 
39                         //check to make sure all parameters are valid for command
40                         map<string,string>::iterator it;
41                         for (it = parameters.begin(); it != parameters.end(); it++) { 
42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
43                                         abort = true;
44                                 }
45                         }
46                         
47                         globaldata->newRead();
48                         
49                         //if the user changes the output directory command factory will send this info to us in the output parameter 
50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
51                         
52                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
54                         if (inputDir == "not found"){   inputDir = "";          }
55                         else {
56                                 string path;
57                                 it = parameters.find("phylip");
58                                 //user has given a template file
59                                 if(it != parameters.end()){ 
60                                         path = hasPath(it->second);
61                                         //if the user has not given a path then, add inputdir. else leave path alone.
62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
63                                 }
64                                 
65                                 it = parameters.find("column");
66                                 //user has given a template file
67                                 if(it != parameters.end()){ 
68                                         path = hasPath(it->second);
69                                         //if the user has not given a path then, add inputdir. else leave path alone.
70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
71                                 }
72                                 
73                                 it = parameters.find("name");
74                                 //user has given a template file
75                                 if(it != parameters.end()){ 
76                                         path = hasPath(it->second);
77                                         //if the user has not given a path then, add inputdir. else leave path alone.
78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
79                                 }
80                                 
81                                 it = parameters.find("taxonomy");
82                                 //user has given a template file
83                                 if(it != parameters.end()){ 
84                                         path = hasPath(it->second);
85                                         //if the user has not given a path then, add inputdir. else leave path alone.
86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
87                                 }
88                                 
89                                 it = parameters.find("fasta");
90                                 //user has given a template file
91                                 if(it != parameters.end()){ 
92                                         path = hasPath(it->second);
93                                         //if the user has not given a path then, add inputdir. else leave path alone.
94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
95                                 }
96                         }
97                         
98                         //check for required parameters
99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
100                         if (phylipfile == "not open") { abort = true; }
101                         else if (phylipfile == "not found") { phylipfile = ""; }        
102                         else {  distfile = phylipfile;  format = "phylip";      }
103                         
104                         columnfile = validParameter.validFile(parameters, "column", true);
105                         if (columnfile == "not open") { abort = true; } 
106                         else if (columnfile == "not found") { columnfile = ""; }
107                         else {  distfile = columnfile; format = "column";       }
108                         
109                         namefile = validParameter.validFile(parameters, "name", true);
110                         if (namefile == "not open") { abort = true; }   
111                         else if (namefile == "not found") { namefile = ""; }
112                         
113                         fastafile = validParameter.validFile(parameters, "fasta", true);
114                         if (fastafile == "not open") { abort = true; }  
115                         else if (fastafile == "not found") { fastafile = ""; }
116                         else { splitmethod = "fasta";  }
117                         
118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
119                         if (taxFile == "not open") { abort = true; }    
120                         else if (taxFile == "not found") { taxFile = ""; }
121                         
122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
124                 
125                         if (columnfile != "") {
126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
127                         }
128                         
129                         if (fastafile != "") {
130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
131                         }
132                                         
133                         //check for optional parameter and set defaults
134                         // ...at some point should added some additional type checking...
135                         //get user cutoff and precision or use defaults
136                         string temp;
137                         temp = validParameter.validFile(parameters, "precision", false);
138                         if (temp == "not found") { temp = "100"; }
139                         //saves precision legnth for formatting below
140                         length = temp.length();
141                         convert(temp, precision); 
142                         
143                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
144                         hard = isTrue(temp);
145                         
146                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
147                         large = isTrue(temp);
148                         
149                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
150                         convert(temp, processors); 
151                         
152                         temp = validParameter.validFile(parameters, "splitmethod", false);      
153                         if (splitmethod != "fasta") {
154                                 if (temp == "not found")  { splitmethod = "distance"; }
155                                 else {  splitmethod = temp; }
156                         }
157                         
158                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
159                         convert(temp, cutoff); 
160                         cutoff += (5 / (precision * 10.0));  
161                         
162                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
163                         convert(temp, taxLevelCutoff); 
164                         
165                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
166                         
167                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
168                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
169                         
170                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
171                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
172                         
173                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
174
175                         showabund = validParameter.validFile(parameters, "showabund", false);
176                         if (showabund == "not found") { showabund = "T"; }
177
178                         timing = validParameter.validFile(parameters, "timing", false);
179                         if (timing == "not found") { timing = "F"; }
180                         
181                 }
182         }
183         catch(exception& e) {
184                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
185                 exit(1);
186         }
187 }
188
189 //**********************************************************************************************************************
190
191 void ClusterSplitCommand::help(){
192         try {
193                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
194                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
195                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
196                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
197                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
198                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file and taxonomy file.  \n");
199                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
200                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
201                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
202                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
203                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
204                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
205                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
206                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
207                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
208                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
209                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
210                 m->mothurOut("The cluster.split command should be in the following format: \n");
211                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
212                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
213
214         }
215         catch(exception& e) {
216                 m->errorOut(e, "ClusterSplitCommand", "help");
217                 exit(1);
218         }
219 }
220
221 //**********************************************************************************************************************
222
223 ClusterSplitCommand::~ClusterSplitCommand(){}
224
225 //**********************************************************************************************************************
226
227 int ClusterSplitCommand::execute(){
228         try {
229         
230                 if (abort == true) {    return 0;       }
231                 time_t estart;
232                 //****************** file prep work ******************************//
233                 
234                 //if user gave a phylip file convert to column file
235                 if (format == "phylip") {
236                         estart = time(NULL);
237                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
238                         
239                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
240                         
241                         NameAssignment* nameMap = NULL;
242                         convert->setFormat("phylip");
243                         convert->read(nameMap);
244                         
245                         if (m->control_pressed) {  delete convert;  return 0;  }
246                         
247                         distfile = convert->getOutputFile();
248                 
249                         //if no names file given with phylip file, create it
250                         ListVector* listToMakeNameFile =  convert->getListVector();
251                         if (namefile == "") {  //you need to make a namefile for split matrix
252                                 ofstream out;
253                                 namefile = phylipfile + ".names";
254                                 openOutputFile(namefile, out);
255                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
256                                         string bin = listToMakeNameFile->get(i);
257                                         out << bin << '\t' << bin << endl;
258                                 }
259                                 out.close();
260                         }
261                         delete listToMakeNameFile;
262                         delete convert;
263                         
264                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
265                 }
266                 if (m->control_pressed) { return 0; }
267                 
268                 estart = time(NULL);
269                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
270                 
271                 //split matrix into non-overlapping groups
272                 SplitMatrix* split;
273                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                       }
274                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);   }
275                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, taxFile, taxLevelCutoff, splitmethod);                                       }
276                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
277                 
278                 split->split();
279                 
280                 if (m->control_pressed) { delete split; return 0; }
281                 
282                 string singletonName = split->getSingletonNames();
283                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
284                 delete split;
285                 
286                 if (m->control_pressed) { return 0; }
287                 
288                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
289                 estart = time(NULL);
290                 
291                 //****************** break up files between processes and cluster each file set ******************************//
292                 vector<string> listFileNames;
293                 set<string> labels;
294                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
295                                 if(processors == 1){
296                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
297                                 }else{
298                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
299                                         dividedNames.resize(processors);
300                                         
301                                         //for each file group figure out which process will complete it
302                                         //want to divide the load intelligently so the big files are spread between processes
303                                         int count = 1;
304                                         for (int i = 0; i < distName.size(); i++) { 
305                                                 int processToAssign = (i+1) % processors; 
306                                                 if (processToAssign == 0) { processToAssign = processors; }
307                                                 
308                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
309                                         }
310                                         
311                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
312                                         for (int i = 0; i < processors; i++) {
313                                                 int remainder = ((i+1) % processors);
314                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
315                                         }
316                                         
317                                         createProcesses(dividedNames);
318                                                         
319                                         if (m->control_pressed) { return 0; }
320
321                                         //get list of list file names from each process
322                                         for(int i=0;i<processors;i++){
323                                                 string filename = toString(processIDS[i]) + ".temp";
324                                                 ifstream in;
325                                                 openInputFile(filename, in);
326                                                 
327                                                 in >> tag; gobble(in);
328                                                 
329                                                 while(!in.eof()) {
330                                                         string tempName;
331                                                         in >> tempName; gobble(in);
332                                                         listFileNames.push_back(tempName);
333                                                 }
334                                                 in.close();
335                                                 remove((toString(processIDS[i]) + ".temp").c_str());
336                                                 
337                                                 //get labels
338                                                 filename = toString(processIDS[i]) + ".temp.labels";
339                                                 ifstream in2;
340                                                 openInputFile(filename, in2);
341                                                 
342                                                 while(!in2.eof()) {
343                                                         string tempName;
344                                                         in2 >> tempName; gobble(in2);
345                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
346                                                 }
347                                                 in2.close();
348                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
349                                         }
350                                 }
351                 #else
352                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
353                 #endif
354                 
355                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
356                 
357                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
358                 
359                 //****************** merge list file and create rabund and sabund files ******************************//
360                 estart = time(NULL);
361                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
362                 
363                 ListVector* listSingle;
364                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
365                 
366                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
367                 
368                 mergeLists(listFileNames, labelBins, listSingle);
369
370                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
371                 
372                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
373                 
374                 m->mothurOutEndLine();
375                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
376                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
377                 m->mothurOutEndLine();
378
379                 return 0;
380         }
381         catch(exception& e) {
382                 m->errorOut(e, "ClusterSplitCommand", "execute");
383                 exit(1);
384         }
385 }
386 //**********************************************************************************************************************
387 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
388         try {
389                                 
390                 map<float, int> labelBin;
391                 vector<float> orderFloat;
392                 int numSingleBins;
393                 
394                 //read in singletons
395                 if (singleton != "none") {
396                         ifstream in;
397                         openInputFile(singleton, in);
398                                 
399                         string firstCol, secondCol;
400                         listSingle = new ListVector();
401                         while (!in.eof()) {
402                                 in >> firstCol >> secondCol; gobble(in);
403                                 listSingle->push_back(secondCol);
404                         }
405                         in.close();
406                         remove(singleton.c_str());
407                         
408                         numSingleBins = listSingle->getNumBins();
409                 }else{  listSingle = NULL; numSingleBins = 0;  }
410                 
411                 //go through users set and make them floats so we can sort them 
412                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
413                         float temp = -10.0;
414
415                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
416                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
417                                                 
418                         orderFloat.push_back(temp);
419                         labelBin[temp] = numSingleBins; //initialize numbins 
420                 }
421         
422                 //sort order
423                 sort(orderFloat.begin(), orderFloat.end());
424                 userLabels.clear();
425                         
426                 //get the list info from each file
427                 for (int k = 0; k < listNames.size(); k++) {
428         
429                         if (m->control_pressed) {  
430                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
431                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
432                                 return labelBin;
433                         }
434                         
435                         InputData* input = new InputData(listNames[k], "list");
436                         ListVector* list = input->getListVector();
437                         string lastLabel = list->getLabel();
438                         
439                         string filledInList = listNames[k] + "filledInTemp";
440                         ofstream outFilled;
441                         openOutputFile(filledInList, outFilled);
442         
443                         //for each label needed
444                         for(int l = 0; l < orderFloat.size(); l++){
445                         
446                                 string thisLabel;
447                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
448                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
449
450                                 //this file has reached the end
451                                 if (list == NULL) { 
452                                         list = input->getListVector(lastLabel, true); 
453                                 }else{  //do you have the distance, or do you need to fill in
454                                                 
455                                         float labelFloat;
456                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
457                                         else { convert(list->getLabel(), labelFloat); }
458
459                                         //check for missing labels
460                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
461                                                 //if its bigger get last label, otherwise keep it
462                                                 delete list;
463                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
464                                         }
465                                         lastLabel = list->getLabel();
466                                 }
467                                 
468                                 //print to new file
469                                 list->setLabel(thisLabel);
470                                 list->print(outFilled);
471                 
472                                 //update labelBin
473                                 labelBin[orderFloat[l]] += list->getNumBins();
474                                                                         
475                                 delete list;
476                                                                         
477                                 list = input->getListVector();
478                         }
479                         
480                         if (list != NULL) { delete list; }
481                         delete input;
482                         
483                         outFilled.close();
484                         remove(listNames[k].c_str());
485                         rename(filledInList.c_str(), listNames[k].c_str());
486                 }
487                 
488                 return labelBin;
489         }
490         catch(exception& e) {
491                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
492                 exit(1);
493         }
494 }
495 //**********************************************************************************************************************
496 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
497         try {
498                 if (outputDir == "") { outputDir += hasPath(distfile); }
499                 fileroot = outputDir + getRootName(getSimpleName(distfile));
500                 
501                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
502                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
503                 openOutputFile(fileroot+ tag + ".list",         outList);
504                                 
505                 outputNames.push_back(fileroot+ tag + ".sabund");
506                 outputNames.push_back(fileroot+ tag + ".rabund");
507                 outputNames.push_back(fileroot+ tag + ".list");
508                 
509                 map<float, int>::iterator itLabel;
510
511                 //for each label needed
512                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
513                         
514                         string thisLabel;
515                         if (itLabel->first == -1) { thisLabel = "unique"; }
516                         else { thisLabel = toString(itLabel->first,  length-1);  } 
517                         
518                         outList << thisLabel << '\t' << itLabel->second << '\t';
519
520                         RAbundVector* rabund = new RAbundVector();
521                         rabund->setLabel(thisLabel);
522
523                         //add in singletons
524                         if (listSingle != NULL) {
525                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
526                                         outList << listSingle->get(j) << '\t';
527                                         rabund->push_back(getNumNames(listSingle->get(j)));
528                                 }
529                         }
530                         
531                         //get the list info from each file
532                         for (int k = 0; k < listNames.size(); k++) {
533         
534                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
535                                 
536                                 InputData* input = new InputData(listNames[k], "list");
537                                 ListVector* list = input->getListVector(thisLabel);
538                                 
539                                 //this file has reached the end
540                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
541                                 else {          
542                                         for (int j = 0; j < list->getNumBins(); j++) {
543                                                 outList << list->get(j) << '\t';
544                                                 rabund->push_back(getNumNames(list->get(j)));
545                                         }
546                                         delete list;
547                                 }
548                                 delete input;
549                         }
550                         
551                         SAbundVector sabund = rabund->getSAbundVector();
552                         
553                         sabund.print(outSabund);
554                         rabund->print(outRabund);
555                         outList << endl;
556                         
557                         delete rabund;
558                 }
559                 
560                 outList.close();
561                 outRabund.close();
562                 outSabund.close();
563                 
564                 if (listSingle != NULL) { delete listSingle;  }
565                 
566                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
567                 
568                 return 0;
569         }
570         catch(exception& e) {
571                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
572                 exit(1);
573         }
574 }
575
576 //**********************************************************************************************************************
577
578 void ClusterSplitCommand::printData(ListVector* oldList){
579         try {
580                 string label = oldList->getLabel();
581                 RAbundVector oldRAbund = oldList->getRAbundVector();
582                 
583                 oldRAbund.setLabel(label);
584                 if (isTrue(showabund)) {
585                         oldRAbund.getSAbundVector().print(cout);
586                 }
587                 oldRAbund.print(outRabund);
588                 oldRAbund.getSAbundVector().print(outSabund);
589         
590                 oldList->print(outList);
591         }
592         catch(exception& e) {
593                 m->errorOut(e, "ClusterSplitCommand", "printData");
594                 exit(1);
595         }
596 }
597 //**********************************************************************************************************************
598 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
599         try {
600         
601         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
602                 int process = 0;
603                 int exitCommand = 1;
604                 processIDS.clear();
605                 
606                 //loop through and create all the processes you want
607                 while (process != processors) {
608                         int pid = fork();
609                         
610                         if (pid > 0) {
611                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
612                                 process++;
613                         }else if (pid == 0){
614                                 set<string> labels;
615                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
616                                 
617                                 //write out names to file
618                                 string filename = toString(getpid()) + ".temp";
619                                 ofstream out;
620                                 openOutputFile(filename, out);
621                                 out << tag << endl;
622                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
623                                 out.close();
624                                 
625                                 //print out labels
626                                 ofstream outLabels;
627                                 filename = toString(getpid()) + ".temp.labels";
628                                 openOutputFile(filename, outLabels);
629                 
630                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
631                                         outLabels << (*it) << endl;
632                                 }
633                                 outLabels.close();
634
635                                 exit(0);
636                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
637                 }
638                 
639                 //force parent to wait until all the processes are done
640                 for (int i=0;i<processors;i++) { 
641                         int temp = processIDS[i];
642                         wait(&temp);
643                 }
644                 
645                 return exitCommand;
646         #endif          
647         
648         }
649         catch(exception& e) {
650                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
651                 exit(1);
652         }
653 }
654 //**********************************************************************************************************************
655
656 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
657         try {
658                 Cluster* cluster;
659                 SparseMatrix* matrix;
660                 ListVector* list;
661                 ListVector oldList;
662                 RAbundVector* rabund;
663                 
664                 vector<string> listFileNames;
665                 
666                 //cluster each distance file
667                 for (int i = 0; i < distNames.size(); i++) {
668                         
669                         string thisNamefile = distNames[i].begin()->second;
670                         string thisDistFile = distNames[i].begin()->first;
671                         
672                         //read in distance file
673                         globaldata->setNameFile(thisNamefile);
674                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
675                         
676                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
677                         
678                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
679                         read->setCutoff(cutoff);
680
681                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
682                         nameMap->readMap();
683                         read->read(nameMap);
684                         
685                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
686                         
687                         list = read->getListVector();
688                         oldList = *list;
689                         matrix = read->getMatrix();
690                         
691                         delete read; 
692                         delete nameMap; 
693                         
694                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
695                 
696                         rabund = new RAbundVector(list->getRAbundVector());
697                         
698                         //create cluster
699                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
700                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
701                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
702                         tag = cluster->getTag();
703                 
704                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
705                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
706                         
707                         ofstream listFile;
708                         openOutputFile(fileroot+ tag + ".list", listFile);
709                 
710                         listFileNames.push_back(fileroot+ tag + ".list");
711                 
712                         time_t estart = time(NULL);
713                         
714                         float previousDist = 0.00000;
715                         float rndPreviousDist = 0.00000;
716                         
717                         oldList = *list;
718
719                         print_start = true;
720                         start = time(NULL);
721                         double saveCutoff = cutoff;
722                 
723                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
724                 
725                                 if (m->control_pressed) { //clean up
726                                         delete matrix; delete list;     delete cluster; delete rabund;
727                                         listFile.close();
728                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
729                                         listFileNames.clear(); return listFileNames;
730                                 }
731                 
732                                 cluster->update(cutoff);
733         
734                                 float dist = matrix->getSmallDist();
735                                 float rndDist;
736                                 if (hard) {
737                                         rndDist = ceilDist(dist, precision); 
738                                 }else{
739                                         rndDist = roundDist(dist, precision); 
740                                 }
741
742                                 if(previousDist <= 0.0000 && dist != previousDist){
743                                         oldList.setLabel("unique");
744                                         oldList.print(listFile);
745                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
746                                 }
747                                 else if(rndDist != rndPreviousDist){
748                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
749                                         oldList.print(listFile);
750                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
751                                 }
752                 
753                                 previousDist = dist;
754                                 rndPreviousDist = rndDist;
755                                 oldList = *list;
756                         }
757
758                 
759                         if(previousDist <= 0.0000){
760                                 oldList.setLabel("unique");
761                                 oldList.print(listFile);
762                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
763                         }
764                         else if(rndPreviousDist<cutoff){
765                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
766                                 oldList.print(listFile);
767                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
768                         }
769         
770                         delete matrix; delete list;     delete cluster; delete rabund; 
771                         listFile.close();
772                         
773                         if (m->control_pressed) { //clean up
774                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
775                                 listFileNames.clear(); return listFileNames;
776                         }
777                         
778                         //remove(thisDistFile.c_str());
779                         //remove(thisNamefile.c_str());
780                 }
781                 
782                                         
783                 return listFileNames;
784         
785         }
786         catch(exception& e) {
787                 m->errorOut(e, "ClusterSplitCommand", "cluster");
788                 exit(1);
789         }
790
791
792 }
793
794 //**********************************************************************************************************************