2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::setParameters(){
22 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
23 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
24 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
25 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
26 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
27 CommandParameter ptaxlevel("taxlevel", "Number", "", "1", "", "", "",false,false); parameters.push_back(ptaxlevel);
28 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
29 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
30 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
31 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
32 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
33 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
34 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
35 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
36 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
37 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
38 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40 vector<string> myArray;
41 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
45 m->errorOut(e, "ClusterSplitCommand", "setParameters");
49 //**********************************************************************************************************************
50 string ClusterSplitCommand::getHelpString(){
52 string helpString = "";
53 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
54 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
55 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
56 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
57 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
58 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
59 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
60 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
61 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
62 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
63 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n";
64 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
65 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
66 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
67 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
68 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n";
69 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 ClusterSplitCommand::ClusterSplitCommand(){
86 abort = true; calledHelp = true;
88 vector<string> tempOutNames;
89 outputTypes["list"] = tempOutNames;
90 outputTypes["rabund"] = tempOutNames;
91 outputTypes["sabund"] = tempOutNames;
92 outputTypes["column"] = tempOutNames;
95 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
99 //**********************************************************************************************************************
100 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
101 ClusterSplitCommand::ClusterSplitCommand(string option) {
103 abort = false; calledHelp = false;
106 //allow user to run help
107 if(option == "help") { help(); abort = true; calledHelp = true; }
110 vector<string> myArray = setParameters();
112 OptionParser parser(option);
113 map<string,string> parameters = parser.getParameters();
115 ValidParameters validParameter("cluster.split");
117 //check to make sure all parameters are valid for command
118 map<string,string>::iterator it;
119 for (it = parameters.begin(); it != parameters.end(); it++) {
120 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
125 //initialize outputTypes
126 vector<string> tempOutNames;
127 outputTypes["list"] = tempOutNames;
128 outputTypes["rabund"] = tempOutNames;
129 outputTypes["sabund"] = tempOutNames;
130 outputTypes["column"] = tempOutNames;
132 //if the user changes the output directory command factory will send this info to us in the output parameter
133 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
135 //if the user changes the input directory command factory will send this info to us in the output parameter
136 string inputDir = validParameter.validFile(parameters, "inputdir", false);
137 if (inputDir == "not found"){ inputDir = ""; }
140 it = parameters.find("phylip");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["phylip"] = inputDir + it->second; }
148 it = parameters.find("column");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["column"] = inputDir + it->second; }
156 it = parameters.find("name");
157 //user has given a template file
158 if(it != parameters.end()){
159 path = m->hasPath(it->second);
160 //if the user has not given a path then, add inputdir. else leave path alone.
161 if (path == "") { parameters["name"] = inputDir + it->second; }
164 it = parameters.find("taxonomy");
165 //user has given a template file
166 if(it != parameters.end()){
167 path = m->hasPath(it->second);
168 //if the user has not given a path then, add inputdir. else leave path alone.
169 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
172 it = parameters.find("fasta");
173 //user has given a template file
174 if(it != parameters.end()){
175 path = m->hasPath(it->second);
176 //if the user has not given a path then, add inputdir. else leave path alone.
177 if (path == "") { parameters["fasta"] = inputDir + it->second; }
181 //check for required parameters
182 phylipfile = validParameter.validFile(parameters, "phylip", true);
183 if (phylipfile == "not open") { abort = true; }
184 else if (phylipfile == "not found") { phylipfile = ""; }
185 else { distfile = phylipfile; format = "phylip"; }
187 columnfile = validParameter.validFile(parameters, "column", true);
188 if (columnfile == "not open") { abort = true; }
189 else if (columnfile == "not found") { columnfile = ""; }
190 else { distfile = columnfile; format = "column"; }
192 namefile = validParameter.validFile(parameters, "name", true);
193 if (namefile == "not open") { abort = true; }
194 else if (namefile == "not found") { namefile = ""; }
196 fastafile = validParameter.validFile(parameters, "fasta", true);
197 if (fastafile == "not open") { abort = true; }
198 else if (fastafile == "not found") { fastafile = ""; }
199 else { distfile = fastafile; splitmethod = "fasta"; }
201 taxFile = validParameter.validFile(parameters, "taxonomy", true);
202 if (taxFile == "not open") { abort = true; }
203 else if (taxFile == "not found") { taxFile = ""; }
205 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
206 //is there are current file available for either of these?
207 //give priority to column, then phylip, then fasta
208 columnfile = m->getColumnFile();
209 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
211 phylipfile = m->getPhylipFile();
212 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
214 fastafile = m->getFastaFile();
215 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
217 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
223 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
225 if (columnfile != "") {
226 if (namefile == "") {
227 namefile = m->getNameFile();
228 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
230 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
236 if (fastafile != "") {
238 taxFile = m->getTaxonomyFile();
239 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
241 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
246 if (namefile == "") {
247 namefile = m->getNameFile();
248 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
250 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
256 //check for optional parameter and set defaults
257 // ...at some point should added some additional type checking...
258 //get user cutoff and precision or use defaults
260 temp = validParameter.validFile(parameters, "precision", false);
261 if (temp == "not found") { temp = "100"; }
262 //saves precision legnth for formatting below
263 length = temp.length();
264 convert(temp, precision);
266 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
267 hard = m->isTrue(temp);
269 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
270 large = m->isTrue(temp);
272 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
273 m->setProcessors(temp);
274 convert(temp, processors);
276 temp = validParameter.validFile(parameters, "splitmethod", false);
277 if (splitmethod != "fasta") {
278 if (temp == "not found") { splitmethod = "distance"; }
279 else { splitmethod = temp; }
282 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
283 convert(temp, cutoff);
284 cutoff += (5 / (precision * 10.0));
286 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
287 convert(temp, taxLevelCutoff);
289 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
291 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
292 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
294 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
295 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
297 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
299 showabund = validParameter.validFile(parameters, "showabund", false);
300 if (showabund == "not found") { showabund = "T"; }
302 timing = validParameter.validFile(parameters, "timing", false);
303 if (timing == "not found") { timing = "F"; }
307 catch(exception& e) {
308 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
313 //**********************************************************************************************************************
315 int ClusterSplitCommand::execute(){
318 if (abort == true) { if (calledHelp) { return 0; } return 2; }
321 vector<string> listFileNames;
323 string singletonName = "";
324 double saveCutoff = cutoff;
326 //****************** file prep work ******************************//
331 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
332 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
334 if (pid == 0) { //only process 0 converts and splits
338 //if user gave a phylip file convert to column file
339 if (format == "phylip") {
341 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
343 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
345 NameAssignment* nameMap = NULL;
346 convert->setFormat("phylip");
347 convert->read(nameMap);
349 if (m->control_pressed) { delete convert; return 0; }
351 distfile = convert->getOutputFile();
353 //if no names file given with phylip file, create it
354 ListVector* listToMakeNameFile = convert->getListVector();
355 if (namefile == "") { //you need to make a namefile for split matrix
357 namefile = phylipfile + ".names";
358 m->openOutputFile(namefile, out);
359 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
360 string bin = listToMakeNameFile->get(i);
361 out << bin << '\t' << bin << endl;
365 delete listToMakeNameFile;
368 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
370 if (m->control_pressed) { return 0; }
373 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
375 //split matrix into non-overlapping groups
377 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
378 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
379 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
380 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
384 if (m->control_pressed) { delete split; return 0; }
386 singletonName = split->getSingletonNames();
387 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
390 //output a merged distance file
391 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
394 if (m->control_pressed) { return 0; }
396 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
399 //****************** break up files between processes and cluster each file set ******************************//
401 ////you are process 0 from above////
403 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
404 dividedNames.resize(processors);
406 //for each file group figure out which process will complete it
407 //want to divide the load intelligently so the big files are spread between processes
408 for (int i = 0; i < distName.size(); i++) {
409 int processToAssign = (i+1) % processors;
410 if (processToAssign == 0) { processToAssign = processors; }
412 dividedNames[(processToAssign-1)].push_back(distName[i]);
415 //not lets reverse the order of ever other process, so we balance big files running with little ones
416 for (int i = 0; i < processors; i++) {
417 int remainder = ((i+1) % processors);
418 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
422 //send each child the list of files it needs to process
423 for(int i = 1; i < processors; i++) {
424 //send number of file pairs
425 int num = dividedNames[i].size();
426 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
428 for (int j = 0; j < num; j++) { //send filenames to process i
429 char tempDistFileName[1024];
430 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
431 int lengthDist = (dividedNames[i][j].begin()->first).length();
433 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
434 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
436 char tempNameFileName[1024];
437 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
438 int lengthName = (dividedNames[i][j].begin()->second).length();
440 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
441 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
446 listFileNames = cluster(dividedNames[0], labels);
448 //receive the other processes info
449 for(int i = 1; i < processors; i++) {
450 int num = dividedNames[i].size();
453 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
454 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
456 //send list filenames to root process
457 for (int j = 0; j < num; j++) {
459 char tempListFileName[1024];
461 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
462 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
464 string myListFileName = tempListFileName;
465 myListFileName = myListFileName.substr(0, lengthList);
467 listFileNames.push_back(myListFileName);
470 //send Labels to root process
472 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
474 for (int j = 0; j < numLabels; j++) {
478 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
479 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
481 string myLabel = tempLabel;
482 myLabel = myLabel.substr(0, lengthLabel);
484 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
488 }else { //you are a child process
489 vector < map<string, string> > myNames;
491 //recieve the files you need to process
492 //receive number of file pairs
494 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
498 for (int j = 0; j < num; j++) { //receive filenames to process
500 char tempDistFileName[1024];
502 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
503 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
505 string myDistFileName = tempDistFileName;
506 myDistFileName = myDistFileName.substr(0, lengthDist);
509 char tempNameFileName[1024];
511 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
512 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
514 string myNameFileName = tempNameFileName;
515 myNameFileName = myNameFileName.substr(0, lengthName);
518 myNames[j][myDistFileName] = myNameFileName;
522 listFileNames = cluster(myNames, labels);
525 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
527 //send list filenames to root process
528 for (int j = 0; j < num; j++) {
529 char tempListFileName[1024];
530 strcpy(tempListFileName, listFileNames[j].c_str());
531 int lengthList = listFileNames[j].length();
533 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
534 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
537 //send Labels to root process
538 int numLabels = labels.size();
539 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
541 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
543 strcpy(tempLabel, (*it).c_str());
544 int lengthLabel = (*it).length();
546 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
547 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
552 MPI_Barrier(MPI_COMM_WORLD);
556 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
558 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
560 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
561 dividedNames.resize(processors);
563 //for each file group figure out which process will complete it
564 //want to divide the load intelligently so the big files are spread between processes
565 for (int i = 0; i < distName.size(); i++) {
566 int processToAssign = (i+1) % processors;
567 if (processToAssign == 0) { processToAssign = processors; }
569 dividedNames[(processToAssign-1)].push_back(distName[i]);
572 //not lets reverse the order of ever other process, so we balance big files running with little ones
573 for (int i = 0; i < processors; i++) {
574 int remainder = ((i+1) % processors);
575 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
578 createProcesses(dividedNames);
580 if (m->control_pressed) { return 0; }
582 //get list of list file names from each process
583 for(int i=0;i<processors;i++){
584 string filename = toString(processIDS[i]) + ".temp";
586 m->openInputFile(filename, in);
588 in >> tag; m->gobble(in);
592 in >> tempName; m->gobble(in);
593 listFileNames.push_back(tempName);
596 remove((toString(processIDS[i]) + ".temp").c_str());
599 filename = toString(processIDS[i]) + ".temp.labels";
601 m->openInputFile(filename, in2);
604 in2 >> tempCutoff; m->gobble(in2);
605 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
609 in2 >> tempName; m->gobble(in2);
610 if (labels.count(tempName) == 0) { labels.insert(tempName); }
613 remove((toString(processIDS[i]) + ".temp.labels").c_str());
617 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
620 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
622 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
624 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
626 //****************** merge list file and create rabund and sabund files ******************************//
628 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
631 if (pid == 0) { //only process 0 merges
634 ListVector* listSingle;
635 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
637 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
639 mergeLists(listFileNames, labelBins, listSingle);
641 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
643 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
645 //set list file as new current listfile
647 itTypes = outputTypes.find("list");
648 if (itTypes != outputTypes.end()) {
649 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
652 //set rabund file as new current rabundfile
653 itTypes = outputTypes.find("rabund");
654 if (itTypes != outputTypes.end()) {
655 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
658 //set sabund file as new current sabundfile
659 itTypes = outputTypes.find("sabund");
660 if (itTypes != outputTypes.end()) {
661 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
664 m->mothurOutEndLine();
665 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
666 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
667 m->mothurOutEndLine();
670 } //only process 0 merges
673 MPI_Barrier(MPI_COMM_WORLD);
678 catch(exception& e) {
679 m->errorOut(e, "ClusterSplitCommand", "execute");
683 //**********************************************************************************************************************
684 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
687 map<float, int> labelBin;
688 vector<float> orderFloat;
692 if (singleton != "none") {
694 m->openInputFile(singleton, in);
696 string firstCol, secondCol;
697 listSingle = new ListVector();
699 in >> firstCol >> secondCol; m->gobble(in);
700 listSingle->push_back(secondCol);
703 remove(singleton.c_str());
705 numSingleBins = listSingle->getNumBins();
706 }else{ listSingle = NULL; numSingleBins = 0; }
708 //go through users set and make them floats so we can sort them
709 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
712 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
713 else if (*it == "unique") { temp = -1.0; }
715 if (temp <= cutoff) {
716 orderFloat.push_back(temp);
717 labelBin[temp] = numSingleBins; //initialize numbins
722 sort(orderFloat.begin(), orderFloat.end());
725 //get the list info from each file
726 for (int k = 0; k < listNames.size(); k++) {
728 if (m->control_pressed) {
729 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
730 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
734 InputData* input = new InputData(listNames[k], "list");
735 ListVector* list = input->getListVector();
736 string lastLabel = list->getLabel();
738 string filledInList = listNames[k] + "filledInTemp";
740 m->openOutputFile(filledInList, outFilled);
742 //for each label needed
743 for(int l = 0; l < orderFloat.size(); l++){
746 if (orderFloat[l] == -1) { thisLabel = "unique"; }
747 else { thisLabel = toString(orderFloat[l], length-1); }
749 //this file has reached the end
751 list = input->getListVector(lastLabel, true);
752 }else{ //do you have the distance, or do you need to fill in
755 if (list->getLabel() == "unique") { labelFloat = -1.0; }
756 else { convert(list->getLabel(), labelFloat); }
758 //check for missing labels
759 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
760 //if its bigger get last label, otherwise keep it
762 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
764 lastLabel = list->getLabel();
768 list->setLabel(thisLabel);
769 list->print(outFilled);
772 labelBin[orderFloat[l]] += list->getNumBins();
776 list = input->getListVector();
779 if (list != NULL) { delete list; }
783 remove(listNames[k].c_str());
784 rename(filledInList.c_str(), listNames[k].c_str());
789 catch(exception& e) {
790 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
794 //**********************************************************************************************************************
795 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
797 if (outputDir == "") { outputDir += m->hasPath(distfile); }
798 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
800 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
801 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
802 m->openOutputFile(fileroot+ tag + ".list", outList);
804 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
805 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
806 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
808 map<float, int>::iterator itLabel;
810 //for each label needed
811 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
814 if (itLabel->first == -1) { thisLabel = "unique"; }
815 else { thisLabel = toString(itLabel->first, length-1); }
817 outList << thisLabel << '\t' << itLabel->second << '\t';
819 RAbundVector* rabund = new RAbundVector();
820 rabund->setLabel(thisLabel);
823 if (listSingle != NULL) {
824 for (int j = 0; j < listSingle->getNumBins(); j++) {
825 outList << listSingle->get(j) << '\t';
826 rabund->push_back(m->getNumNames(listSingle->get(j)));
830 //get the list info from each file
831 for (int k = 0; k < listNames.size(); k++) {
833 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
835 InputData* input = new InputData(listNames[k], "list");
836 ListVector* list = input->getListVector(thisLabel);
838 //this file has reached the end
839 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
841 for (int j = 0; j < list->getNumBins(); j++) {
842 outList << list->get(j) << '\t';
843 rabund->push_back(m->getNumNames(list->get(j)));
850 SAbundVector sabund = rabund->getSAbundVector();
852 sabund.print(outSabund);
853 rabund->print(outRabund);
863 if (listSingle != NULL) { delete listSingle; }
865 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
869 catch(exception& e) {
870 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
875 //**********************************************************************************************************************
877 void ClusterSplitCommand::printData(ListVector* oldList){
879 string label = oldList->getLabel();
880 RAbundVector oldRAbund = oldList->getRAbundVector();
882 oldRAbund.setLabel(label);
883 if (m->isTrue(showabund)) {
884 oldRAbund.getSAbundVector().print(cout);
886 oldRAbund.print(outRabund);
887 oldRAbund.getSAbundVector().print(outSabund);
889 oldList->print(outList);
891 catch(exception& e) {
892 m->errorOut(e, "ClusterSplitCommand", "printData");
896 //**********************************************************************************************************************
897 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
900 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
905 //loop through and create all the processes you want
906 while (process != processors) {
910 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
914 vector<string> listFileNames = cluster(dividedNames[process], labels);
916 //write out names to file
917 string filename = toString(getpid()) + ".temp";
919 m->openOutputFile(filename, out);
921 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
926 filename = toString(getpid()) + ".temp.labels";
927 m->openOutputFile(filename, outLabels);
929 outLabels << cutoff << endl;
930 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
931 outLabels << (*it) << endl;
937 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
938 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
943 //force parent to wait until all the processes are done
944 for (int i=0;i<processors;i++) {
945 int temp = processIDS[i];
953 catch(exception& e) {
954 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
958 //**********************************************************************************************************************
960 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
963 SparseMatrix* matrix;
966 RAbundVector* rabund;
968 vector<string> listFileNames;
970 double smallestCutoff = cutoff;
972 //cluster each distance file
973 for (int i = 0; i < distNames.size(); i++) {
974 if (m->control_pressed) { return listFileNames; }
976 string thisNamefile = distNames[i].begin()->second;
977 string thisDistFile = distNames[i].begin()->first;
981 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
983 //output your files too
985 cout << endl << "Reading " << thisDistFile << endl;
989 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
991 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
992 read->setCutoff(cutoff);
994 NameAssignment* nameMap = new NameAssignment(thisNamefile);
998 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
1000 list = read->getListVector();
1002 matrix = read->getMatrix();
1009 //output your files too
1011 cout << endl << "Clustering " << thisDistFile << endl;
1015 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1017 rabund = new RAbundVector(list->getRAbundVector());
1020 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1021 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1022 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1023 tag = cluster->getTag();
1025 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1026 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1029 m->openOutputFile(fileroot+ tag + ".list", listFile);
1031 listFileNames.push_back(fileroot+ tag + ".list");
1033 float previousDist = 0.00000;
1034 float rndPreviousDist = 0.00000;
1040 double saveCutoff = cutoff;
1042 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1044 if (m->control_pressed) { //clean up
1045 delete matrix; delete list; delete cluster; delete rabund;
1047 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1048 listFileNames.clear(); return listFileNames;
1051 cluster->update(saveCutoff);
1053 float dist = matrix->getSmallDist();
1056 rndDist = m->ceilDist(dist, precision);
1058 rndDist = m->roundDist(dist, precision);
1061 if(previousDist <= 0.0000 && dist != previousDist){
1062 oldList.setLabel("unique");
1063 oldList.print(listFile);
1064 if (labels.count("unique") == 0) { labels.insert("unique"); }
1066 else if(rndDist != rndPreviousDist){
1067 oldList.setLabel(toString(rndPreviousDist, length-1));
1068 oldList.print(listFile);
1069 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1072 previousDist = dist;
1073 rndPreviousDist = rndDist;
1078 if(previousDist <= 0.0000){
1079 oldList.setLabel("unique");
1080 oldList.print(listFile);
1081 if (labels.count("unique") == 0) { labels.insert("unique"); }
1083 else if(rndPreviousDist<cutoff){
1084 oldList.setLabel(toString(rndPreviousDist, length-1));
1085 oldList.print(listFile);
1086 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1089 delete matrix; delete list; delete cluster; delete rabund;
1092 if (m->control_pressed) { //clean up
1093 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1094 listFileNames.clear(); return listFileNames;
1097 remove(thisDistFile.c_str());
1098 remove(thisNamefile.c_str());
1100 if (saveCutoff != cutoff) {
1101 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1102 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1104 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1107 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1110 cutoff = smallestCutoff;
1112 return listFileNames;
1115 catch(exception& e) {
1116 m->errorOut(e, "ClusterSplitCommand", "cluster");
1122 //**********************************************************************************************************************
1124 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1129 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1134 string thisOutputDir = outputDir;
1135 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1136 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1137 remove(outputFileName.c_str());
1140 for (int i = 0; i < distNames.size(); i++) {
1141 if (m->control_pressed) { return 0; }
1143 string thisDistFile = distNames[i].begin()->first;
1145 m->appendFiles(thisDistFile, outputFileName);
1148 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1158 catch(exception& e) {
1159 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1163 //**********************************************************************************************************************