]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.cpp
fixed bug in cluster.split
[mothur.git] / clustersplitcommand.cpp
1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
17
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option)  {
21         try{
22                 globaldata = GlobalData::getInstance();
23                 abort = false;
24                 format = "";
25                 
26                 //allow user to run help
27                 if(option == "help") { help(); abort = true; }
28                 
29                 else {
30                         //valid paramters for this command
31                         string Array[] =  {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
32                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33                         
34                         OptionParser parser(option);
35                         map<string,string> parameters = parser.getParameters();
36                         
37                         ValidParameters validParameter;
38                 
39                         //check to make sure all parameters are valid for command
40                         map<string,string>::iterator it;
41                         for (it = parameters.begin(); it != parameters.end(); it++) { 
42                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
43                                         abort = true;
44                                 }
45                         }
46                         
47                         globaldata->newRead();
48                         
49                         //if the user changes the output directory command factory will send this info to us in the output parameter 
50                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
51                         
52                                 //if the user changes the input directory command factory will send this info to us in the output parameter 
53                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
54                         if (inputDir == "not found"){   inputDir = "";          }
55                         else {
56                                 string path;
57                                 it = parameters.find("phylip");
58                                 //user has given a template file
59                                 if(it != parameters.end()){ 
60                                         path = hasPath(it->second);
61                                         //if the user has not given a path then, add inputdir. else leave path alone.
62                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
63                                 }
64                                 
65                                 it = parameters.find("column");
66                                 //user has given a template file
67                                 if(it != parameters.end()){ 
68                                         path = hasPath(it->second);
69                                         //if the user has not given a path then, add inputdir. else leave path alone.
70                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
71                                 }
72                                 
73                                 it = parameters.find("name");
74                                 //user has given a template file
75                                 if(it != parameters.end()){ 
76                                         path = hasPath(it->second);
77                                         //if the user has not given a path then, add inputdir. else leave path alone.
78                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
79                                 }
80                                 
81                                 it = parameters.find("taxonomy");
82                                 //user has given a template file
83                                 if(it != parameters.end()){ 
84                                         path = hasPath(it->second);
85                                         //if the user has not given a path then, add inputdir. else leave path alone.
86                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
87                                 }
88                                 
89                                 it = parameters.find("fasta");
90                                 //user has given a template file
91                                 if(it != parameters.end()){ 
92                                         path = hasPath(it->second);
93                                         //if the user has not given a path then, add inputdir. else leave path alone.
94                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
95                                 }
96                         }
97                         
98                         //check for required parameters
99                         phylipfile = validParameter.validFile(parameters, "phylip", true);
100                         if (phylipfile == "not open") { abort = true; }
101                         else if (phylipfile == "not found") { phylipfile = ""; }        
102                         else {  distfile = phylipfile;  format = "phylip";      }
103                         
104                         columnfile = validParameter.validFile(parameters, "column", true);
105                         if (columnfile == "not open") { abort = true; } 
106                         else if (columnfile == "not found") { columnfile = ""; }
107                         else {  distfile = columnfile; format = "column";       }
108                         
109                         namefile = validParameter.validFile(parameters, "name", true);
110                         if (namefile == "not open") { abort = true; }   
111                         else if (namefile == "not found") { namefile = ""; }
112                         
113                         fastafile = validParameter.validFile(parameters, "fasta", true);
114                         if (fastafile == "not open") { abort = true; }  
115                         else if (fastafile == "not found") { fastafile = ""; }
116                         else { splitmethod = "fasta";  }
117                         
118                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
119                         if (taxFile == "not open") { abort = true; }    
120                         else if (taxFile == "not found") { taxFile = ""; }
121                         
122                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
123                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
124                 
125                         if (columnfile != "") {
126                                 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
127                         }
128                         
129                         if (fastafile != "") {
130                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
131                                 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
132                         }
133                                         
134                         //check for optional parameter and set defaults
135                         // ...at some point should added some additional type checking...
136                         //get user cutoff and precision or use defaults
137                         string temp;
138                         temp = validParameter.validFile(parameters, "precision", false);
139                         if (temp == "not found") { temp = "100"; }
140                         //saves precision legnth for formatting below
141                         length = temp.length();
142                         convert(temp, precision); 
143                         
144                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
145                         hard = isTrue(temp);
146                         
147                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
148                         large = isTrue(temp);
149                         
150                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
151                         convert(temp, processors); 
152                         
153                         temp = validParameter.validFile(parameters, "splitmethod", false);      
154                         if (splitmethod != "fasta") {
155                                 if (temp == "not found")  { splitmethod = "distance"; }
156                                 else {  splitmethod = temp; }
157                         }
158                         
159                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
160                         convert(temp, cutoff); 
161                         cutoff += (5 / (precision * 10.0));  
162                         
163                         temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
164                         convert(temp, taxLevelCutoff); 
165                         
166                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
167                         
168                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
169                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
170                         
171                         if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
172                         else { m->mothurOut(splitmethod + " is not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
173                         
174                         if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
175
176                         showabund = validParameter.validFile(parameters, "showabund", false);
177                         if (showabund == "not found") { showabund = "T"; }
178
179                         timing = validParameter.validFile(parameters, "timing", false);
180                         if (timing == "not found") { timing = "F"; }
181                         
182                 }
183         }
184         catch(exception& e) {
185                 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
186                 exit(1);
187         }
188 }
189
190 //**********************************************************************************************************************
191
192 void ClusterSplitCommand::help(){
193         try {
194                 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
195                 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
196                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
197                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
198                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
199                 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
200                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
201                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
202                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
203                 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
204                 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
205                 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
206                 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
207                 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
208                 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
209                 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
210                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
211                 m->mothurOut("The cluster.split command should be in the following format: \n");
212                 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
213                 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
214
215         }
216         catch(exception& e) {
217                 m->errorOut(e, "ClusterSplitCommand", "help");
218                 exit(1);
219         }
220 }
221
222 //**********************************************************************************************************************
223
224 ClusterSplitCommand::~ClusterSplitCommand(){}
225
226 //**********************************************************************************************************************
227
228 int ClusterSplitCommand::execute(){
229         try {
230         
231                 if (abort == true) {    return 0;       }
232                 time_t estart;
233                 //****************** file prep work ******************************//
234                 
235                 //if user gave a phylip file convert to column file
236                 if (format == "phylip") {
237                         estart = time(NULL);
238                         m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
239                         
240                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
241                         
242                         NameAssignment* nameMap = NULL;
243                         convert->setFormat("phylip");
244                         convert->read(nameMap);
245                         
246                         if (m->control_pressed) {  delete convert;  return 0;  }
247                         
248                         distfile = convert->getOutputFile();
249                 
250                         //if no names file given with phylip file, create it
251                         ListVector* listToMakeNameFile =  convert->getListVector();
252                         if (namefile == "") {  //you need to make a namefile for split matrix
253                                 ofstream out;
254                                 namefile = phylipfile + ".names";
255                                 openOutputFile(namefile, out);
256                                 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
257                                         string bin = listToMakeNameFile->get(i);
258                                         out << bin << '\t' << bin << endl;
259                                 }
260                                 out.close();
261                         }
262                         delete listToMakeNameFile;
263                         delete convert;
264                         
265                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
266                 }
267                 if (m->control_pressed) { return 0; }
268                 
269                 estart = time(NULL);
270                 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
271                 
272                 //split matrix into non-overlapping groups
273                 SplitMatrix* split;
274                 if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                       }
275                 else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                       }
276                 else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors);         }
277                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
278                 
279                 split->split();
280                 
281                 if (m->control_pressed) { delete split; return 0; }
282                 
283                 string singletonName = split->getSingletonNames();
284                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
285                 delete split;
286                 
287                 if (m->control_pressed) { return 0; }
288                 
289                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
290                 estart = time(NULL);
291                 
292                 //****************** break up files between processes and cluster each file set ******************************//
293                 vector<string> listFileNames;
294                 set<string> labels;
295                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
296                                 if(processors == 1){
297                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
298                                 }else{
299                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
300                                         dividedNames.resize(processors);
301                                         
302                                         //for each file group figure out which process will complete it
303                                         //want to divide the load intelligently so the big files are spread between processes
304                                         int count = 1;
305                                         for (int i = 0; i < distName.size(); i++) { 
306                                                 int processToAssign = (i+1) % processors; 
307                                                 if (processToAssign == 0) { processToAssign = processors; }
308                                                 
309                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
310                                         }
311                                         
312                                         //not lets reverse the order of ever other process, so we balance big files running with little ones
313                                         for (int i = 0; i < processors; i++) {
314                                                 int remainder = ((i+1) % processors);
315                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
316                                         }
317                                         
318                                         createProcesses(dividedNames);
319                                                         
320                                         if (m->control_pressed) { return 0; }
321
322                                         //get list of list file names from each process
323                                         for(int i=0;i<processors;i++){
324                                                 string filename = toString(processIDS[i]) + ".temp";
325                                                 ifstream in;
326                                                 openInputFile(filename, in);
327                                                 
328                                                 in >> tag; gobble(in);
329                                                 
330                                                 while(!in.eof()) {
331                                                         string tempName;
332                                                         in >> tempName; gobble(in);
333                                                         listFileNames.push_back(tempName);
334                                                 }
335                                                 in.close();
336                                                 remove((toString(processIDS[i]) + ".temp").c_str());
337                                                 
338                                                 //get labels
339                                                 filename = toString(processIDS[i]) + ".temp.labels";
340                                                 ifstream in2;
341                                                 openInputFile(filename, in2);
342                                                 
343                                                 while(!in2.eof()) {
344                                                         string tempName;
345                                                         in2 >> tempName; gobble(in2);
346                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
347                                                 }
348                                                 in2.close();
349                                                 remove((toString(processIDS[i]) + ".temp.labels").c_str());
350                                         }
351                                 }
352                 #else
353                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
354                 #endif
355                 
356                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
357                 
358                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
359                 
360                 //****************** merge list file and create rabund and sabund files ******************************//
361                 estart = time(NULL);
362                 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
363                 
364                 ListVector* listSingle;
365                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
366                 
367                 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
368                 
369                 mergeLists(listFileNames, labelBins, listSingle);
370
371                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
372                 
373                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
374                 
375                 m->mothurOutEndLine();
376                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
377                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
378                 m->mothurOutEndLine();
379
380                 return 0;
381         }
382         catch(exception& e) {
383                 m->errorOut(e, "ClusterSplitCommand", "execute");
384                 exit(1);
385         }
386 }
387 //**********************************************************************************************************************
388 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
389         try {
390                                 
391                 map<float, int> labelBin;
392                 vector<float> orderFloat;
393                 int numSingleBins;
394                 
395                 //read in singletons
396                 if (singleton != "none") {
397                         ifstream in;
398                         openInputFile(singleton, in);
399                                 
400                         string firstCol, secondCol;
401                         listSingle = new ListVector();
402                         while (!in.eof()) {
403                                 in >> firstCol >> secondCol; gobble(in);
404                                 listSingle->push_back(secondCol);
405                         }
406                         in.close();
407                         remove(singleton.c_str());
408                         
409                         numSingleBins = listSingle->getNumBins();
410                 }else{  listSingle = NULL; numSingleBins = 0;  }
411                 
412                 //go through users set and make them floats so we can sort them 
413                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
414                         float temp = -10.0;
415
416                         if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
417                         else if (*it == "unique")                                                                               {       temp = -1.0;            }
418                                                 
419                         orderFloat.push_back(temp);
420                         labelBin[temp] = numSingleBins; //initialize numbins 
421                 }
422         
423                 //sort order
424                 sort(orderFloat.begin(), orderFloat.end());
425                 userLabels.clear();
426                         
427                 //get the list info from each file
428                 for (int k = 0; k < listNames.size(); k++) {
429         
430                         if (m->control_pressed) {  
431                                 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
432                                 for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
433                                 return labelBin;
434                         }
435                         
436                         InputData* input = new InputData(listNames[k], "list");
437                         ListVector* list = input->getListVector();
438                         string lastLabel = list->getLabel();
439                         
440                         string filledInList = listNames[k] + "filledInTemp";
441                         ofstream outFilled;
442                         openOutputFile(filledInList, outFilled);
443         
444                         //for each label needed
445                         for(int l = 0; l < orderFloat.size(); l++){
446                         
447                                 string thisLabel;
448                                 if (orderFloat[l] == -1) { thisLabel = "unique"; }
449                                 else { thisLabel = toString(orderFloat[l],  length-1);  } 
450
451                                 //this file has reached the end
452                                 if (list == NULL) { 
453                                         list = input->getListVector(lastLabel, true); 
454                                 }else{  //do you have the distance, or do you need to fill in
455                                                 
456                                         float labelFloat;
457                                         if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
458                                         else { convert(list->getLabel(), labelFloat); }
459
460                                         //check for missing labels
461                                         if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
462                                                 //if its bigger get last label, otherwise keep it
463                                                 delete list;
464                                                 list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
465                                         }
466                                         lastLabel = list->getLabel();
467                                 }
468                                 
469                                 //print to new file
470                                 list->setLabel(thisLabel);
471                                 list->print(outFilled);
472                 
473                                 //update labelBin
474                                 labelBin[orderFloat[l]] += list->getNumBins();
475                                                                         
476                                 delete list;
477                                                                         
478                                 list = input->getListVector();
479                         }
480                         
481                         if (list != NULL) { delete list; }
482                         delete input;
483                         
484                         outFilled.close();
485                         remove(listNames[k].c_str());
486                         rename(filledInList.c_str(), listNames[k].c_str());
487                 }
488                 
489                 return labelBin;
490         }
491         catch(exception& e) {
492                 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
493                 exit(1);
494         }
495 }
496 //**********************************************************************************************************************
497 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
498         try {
499                 if (outputDir == "") { outputDir += hasPath(distfile); }
500                 fileroot = outputDir + getRootName(getSimpleName(distfile));
501                 
502                 openOutputFile(fileroot+ tag + ".sabund",       outSabund);
503                 openOutputFile(fileroot+ tag + ".rabund",       outRabund);
504                 openOutputFile(fileroot+ tag + ".list",         outList);
505                                 
506                 outputNames.push_back(fileroot+ tag + ".sabund");
507                 outputNames.push_back(fileroot+ tag + ".rabund");
508                 outputNames.push_back(fileroot+ tag + ".list");
509                 
510                 map<float, int>::iterator itLabel;
511
512                 //for each label needed
513                 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
514                         
515                         string thisLabel;
516                         if (itLabel->first == -1) { thisLabel = "unique"; }
517                         else { thisLabel = toString(itLabel->first,  length-1);  } 
518                         
519                         outList << thisLabel << '\t' << itLabel->second << '\t';
520
521                         RAbundVector* rabund = new RAbundVector();
522                         rabund->setLabel(thisLabel);
523
524                         //add in singletons
525                         if (listSingle != NULL) {
526                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
527                                         outList << listSingle->get(j) << '\t';
528                                         rabund->push_back(getNumNames(listSingle->get(j)));
529                                 }
530                         }
531                         
532                         //get the list info from each file
533                         for (int k = 0; k < listNames.size(); k++) {
534         
535                                 if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
536                                 
537                                 InputData* input = new InputData(listNames[k], "list");
538                                 ListVector* list = input->getListVector(thisLabel);
539                                 
540                                 //this file has reached the end
541                                 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
542                                 else {          
543                                         for (int j = 0; j < list->getNumBins(); j++) {
544                                                 outList << list->get(j) << '\t';
545                                                 rabund->push_back(getNumNames(list->get(j)));
546                                         }
547                                         delete list;
548                                 }
549                                 delete input;
550                         }
551                         
552                         SAbundVector sabund = rabund->getSAbundVector();
553                         
554                         sabund.print(outSabund);
555                         rabund->print(outRabund);
556                         outList << endl;
557                         
558                         delete rabund;
559                 }
560                 
561                 outList.close();
562                 outRabund.close();
563                 outSabund.close();
564                 
565                 if (listSingle != NULL) { delete listSingle;  }
566                 
567                 for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
568                 
569                 return 0;
570         }
571         catch(exception& e) {
572                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
573                 exit(1);
574         }
575 }
576
577 //**********************************************************************************************************************
578
579 void ClusterSplitCommand::printData(ListVector* oldList){
580         try {
581                 string label = oldList->getLabel();
582                 RAbundVector oldRAbund = oldList->getRAbundVector();
583                 
584                 oldRAbund.setLabel(label);
585                 if (isTrue(showabund)) {
586                         oldRAbund.getSAbundVector().print(cout);
587                 }
588                 oldRAbund.print(outRabund);
589                 oldRAbund.getSAbundVector().print(outSabund);
590         
591                 oldList->print(outList);
592         }
593         catch(exception& e) {
594                 m->errorOut(e, "ClusterSplitCommand", "printData");
595                 exit(1);
596         }
597 }
598 //**********************************************************************************************************************
599 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
600         try {
601         
602         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
603                 int process = 0;
604                 int exitCommand = 1;
605                 processIDS.clear();
606                 
607                 //loop through and create all the processes you want
608                 while (process != processors) {
609                         int pid = fork();
610                         
611                         if (pid > 0) {
612                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
613                                 process++;
614                         }else if (pid == 0){
615                                 set<string> labels;
616                                 vector<string> listFileNames = cluster(dividedNames[process], labels);
617                                 
618                                 //write out names to file
619                                 string filename = toString(getpid()) + ".temp";
620                                 ofstream out;
621                                 openOutputFile(filename, out);
622                                 out << tag << endl;
623                                 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
624                                 out.close();
625                                 
626                                 //print out labels
627                                 ofstream outLabels;
628                                 filename = toString(getpid()) + ".temp.labels";
629                                 openOutputFile(filename, outLabels);
630                 
631                                 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
632                                         outLabels << (*it) << endl;
633                                 }
634                                 outLabels.close();
635
636                                 exit(0);
637                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
638                 }
639                 
640                 //force parent to wait until all the processes are done
641                 for (int i=0;i<processors;i++) { 
642                         int temp = processIDS[i];
643                         wait(&temp);
644                 }
645                 
646                 return exitCommand;
647         #endif          
648         
649         }
650         catch(exception& e) {
651                 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
652                 exit(1);
653         }
654 }
655 //**********************************************************************************************************************
656
657 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
658         try {
659                 Cluster* cluster;
660                 SparseMatrix* matrix;
661                 ListVector* list;
662                 ListVector oldList;
663                 RAbundVector* rabund;
664                 
665                 vector<string> listFileNames;
666                 
667                 //cluster each distance file
668                 for (int i = 0; i < distNames.size(); i++) {
669                         
670                         string thisNamefile = distNames[i].begin()->second;
671                         string thisDistFile = distNames[i].begin()->first;
672                         
673                         //read in distance file
674                         globaldata->setNameFile(thisNamefile);
675                         globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
676                         
677                         m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
678                         
679                         ReadMatrix* read = new ReadColumnMatrix(thisDistFile);  
680                         read->setCutoff(cutoff);
681
682                         NameAssignment* nameMap = new NameAssignment(thisNamefile);
683                         nameMap->readMap();
684                         read->read(nameMap);
685                         
686                         if (m->control_pressed) {  delete read; delete nameMap; return listFileNames; }
687                         
688                         list = read->getListVector();
689                         oldList = *list;
690                         matrix = read->getMatrix();
691                         
692                         delete read; 
693                         delete nameMap; 
694                         
695                         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
696                 
697                         rabund = new RAbundVector(list->getRAbundVector());
698                         
699                         //create cluster
700                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
701                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
702                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
703                         tag = cluster->getTag();
704                 
705                         if (outputDir == "") { outputDir += hasPath(thisDistFile); }
706                         fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
707                         
708                         ofstream listFile;
709                         openOutputFile(fileroot+ tag + ".list", listFile);
710                 
711                         listFileNames.push_back(fileroot+ tag + ".list");
712                 
713                         time_t estart = time(NULL);
714                         
715                         float previousDist = 0.00000;
716                         float rndPreviousDist = 0.00000;
717                         
718                         oldList = *list;
719
720                         print_start = true;
721                         start = time(NULL);
722                         double saveCutoff = cutoff;
723                 
724                         while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
725                 
726                                 if (m->control_pressed) { //clean up
727                                         delete matrix; delete list;     delete cluster; delete rabund;
728                                         listFile.close();
729                                         for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
730                                         listFileNames.clear(); return listFileNames;
731                                 }
732                 
733                                 cluster->update(cutoff);
734         
735                                 float dist = matrix->getSmallDist();
736                                 float rndDist;
737                                 if (hard) {
738                                         rndDist = ceilDist(dist, precision); 
739                                 }else{
740                                         rndDist = roundDist(dist, precision); 
741                                 }
742
743                                 if(previousDist <= 0.0000 && dist != previousDist){
744                                         oldList.setLabel("unique");
745                                         oldList.print(listFile);
746                                         if (labels.count("unique") == 0) {  labels.insert("unique");  }
747                                 }
748                                 else if(rndDist != rndPreviousDist){
749                                         oldList.setLabel(toString(rndPreviousDist,  length-1));
750                                         oldList.print(listFile);
751                                         if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
752                                 }
753                 
754                                 previousDist = dist;
755                                 rndPreviousDist = rndDist;
756                                 oldList = *list;
757                         }
758
759                 
760                         if(previousDist <= 0.0000){
761                                 oldList.setLabel("unique");
762                                 oldList.print(listFile);
763                                 if (labels.count("unique") == 0) { labels.insert("unique"); }
764                         }
765                         else if(rndPreviousDist<cutoff){
766                                 oldList.setLabel(toString(rndPreviousDist,  length-1));
767                                 oldList.print(listFile);
768                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
769                         }
770         
771                         delete matrix; delete list;     delete cluster; delete rabund; 
772                         listFile.close();
773                         
774                         if (m->control_pressed) { //clean up
775                                 for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
776                                 listFileNames.clear(); return listFileNames;
777                         }
778                         
779                         remove(thisDistFile.c_str());
780                         remove(thisNamefile.c_str());
781                 }
782                 
783                                         
784                 return listFileNames;
785         
786         }
787         catch(exception& e) {
788                 m->errorOut(e, "ClusterSplitCommand", "cluster");
789                 exit(1);
790         }
791
792
793 }
794
795 //**********************************************************************************************************************