2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::setParameters(){
22 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
23 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
24 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
25 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
26 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
27 CommandParameter ptaxlevel("taxlevel", "Number", "", "1", "", "", "",false,false); parameters.push_back(ptaxlevel);
28 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
29 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
30 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
31 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
32 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
33 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
34 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
35 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
36 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
37 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
38 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40 vector<string> myArray;
41 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
45 m->errorOut(e, "ClusterSplitCommand", "setParameters");
49 //**********************************************************************************************************************
50 string ClusterSplitCommand::getHelpString(){
52 string helpString = "";
53 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
54 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
55 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
56 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
57 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
58 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
59 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
60 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
61 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
62 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
63 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n";
64 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
65 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
66 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
67 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
68 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n";
69 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 ClusterSplitCommand::ClusterSplitCommand(){
86 abort = true; calledHelp = true;
88 vector<string> tempOutNames;
89 outputTypes["list"] = tempOutNames;
90 outputTypes["rabund"] = tempOutNames;
91 outputTypes["sabund"] = tempOutNames;
92 outputTypes["column"] = tempOutNames;
95 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
99 //**********************************************************************************************************************
100 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
101 ClusterSplitCommand::ClusterSplitCommand(string option) {
103 abort = false; calledHelp = false;
106 //allow user to run help
107 if(option == "help") { help(); abort = true; calledHelp = true; }
108 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
111 vector<string> myArray = setParameters();
113 OptionParser parser(option);
114 map<string,string> parameters = parser.getParameters();
116 ValidParameters validParameter("cluster.split");
118 //check to make sure all parameters are valid for command
119 map<string,string>::iterator it;
120 for (it = parameters.begin(); it != parameters.end(); it++) {
121 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
126 //initialize outputTypes
127 vector<string> tempOutNames;
128 outputTypes["list"] = tempOutNames;
129 outputTypes["rabund"] = tempOutNames;
130 outputTypes["sabund"] = tempOutNames;
131 outputTypes["column"] = tempOutNames;
133 //if the user changes the output directory command factory will send this info to us in the output parameter
134 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
136 //if the user changes the input directory command factory will send this info to us in the output parameter
137 string inputDir = validParameter.validFile(parameters, "inputdir", false);
138 if (inputDir == "not found"){ inputDir = ""; }
141 it = parameters.find("phylip");
142 //user has given a template file
143 if(it != parameters.end()){
144 path = m->hasPath(it->second);
145 //if the user has not given a path then, add inputdir. else leave path alone.
146 if (path == "") { parameters["phylip"] = inputDir + it->second; }
149 it = parameters.find("column");
150 //user has given a template file
151 if(it != parameters.end()){
152 path = m->hasPath(it->second);
153 //if the user has not given a path then, add inputdir. else leave path alone.
154 if (path == "") { parameters["column"] = inputDir + it->second; }
157 it = parameters.find("name");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["name"] = inputDir + it->second; }
165 it = parameters.find("taxonomy");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
173 it = parameters.find("fasta");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["fasta"] = inputDir + it->second; }
182 //check for required parameters
183 phylipfile = validParameter.validFile(parameters, "phylip", true);
184 if (phylipfile == "not open") { abort = true; }
185 else if (phylipfile == "not found") { phylipfile = ""; }
186 else { distfile = phylipfile; format = "phylip"; }
188 columnfile = validParameter.validFile(parameters, "column", true);
189 if (columnfile == "not open") { abort = true; }
190 else if (columnfile == "not found") { columnfile = ""; }
191 else { distfile = columnfile; format = "column"; }
193 namefile = validParameter.validFile(parameters, "name", true);
194 if (namefile == "not open") { abort = true; }
195 else if (namefile == "not found") { namefile = ""; }
197 fastafile = validParameter.validFile(parameters, "fasta", true);
198 if (fastafile == "not open") { abort = true; }
199 else if (fastafile == "not found") { fastafile = ""; }
200 else { distfile = fastafile; splitmethod = "fasta"; }
202 taxFile = validParameter.validFile(parameters, "taxonomy", true);
203 if (taxFile == "not open") { abort = true; }
204 else if (taxFile == "not found") { taxFile = ""; }
206 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
207 //is there are current file available for either of these?
208 //give priority to column, then phylip, then fasta
209 columnfile = m->getColumnFile();
210 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
212 phylipfile = m->getPhylipFile();
213 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
215 fastafile = m->getFastaFile();
216 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
218 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
224 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
226 if (columnfile != "") {
227 if (namefile == "") {
228 namefile = m->getNameFile();
229 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
231 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
237 if (fastafile != "") {
239 taxFile = m->getTaxonomyFile();
240 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
242 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
247 if (namefile == "") {
248 namefile = m->getNameFile();
249 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
257 //check for optional parameter and set defaults
258 // ...at some point should added some additional type checking...
259 //get user cutoff and precision or use defaults
261 temp = validParameter.validFile(parameters, "precision", false);
262 if (temp == "not found") { temp = "100"; }
263 //saves precision legnth for formatting below
264 length = temp.length();
265 convert(temp, precision);
267 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
268 hard = m->isTrue(temp);
270 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
271 large = m->isTrue(temp);
273 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
274 m->setProcessors(temp);
275 convert(temp, processors);
277 temp = validParameter.validFile(parameters, "splitmethod", false);
278 if (splitmethod != "fasta") {
279 if (temp == "not found") { splitmethod = "distance"; }
280 else { splitmethod = temp; }
283 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
284 convert(temp, cutoff);
285 cutoff += (5 / (precision * 10.0));
287 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
288 convert(temp, taxLevelCutoff);
290 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
292 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
293 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
295 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
296 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
298 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
300 showabund = validParameter.validFile(parameters, "showabund", false);
301 if (showabund == "not found") { showabund = "T"; }
303 timing = validParameter.validFile(parameters, "timing", false);
304 if (timing == "not found") { timing = "F"; }
308 catch(exception& e) {
309 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
314 //**********************************************************************************************************************
316 int ClusterSplitCommand::execute(){
319 if (abort == true) { if (calledHelp) { return 0; } return 2; }
322 vector<string> listFileNames;
324 string singletonName = "";
325 double saveCutoff = cutoff;
327 //****************** file prep work ******************************//
332 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
333 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
335 if (pid == 0) { //only process 0 converts and splits
339 //if user gave a phylip file convert to column file
340 if (format == "phylip") {
342 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
344 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
346 NameAssignment* nameMap = NULL;
347 convert->setFormat("phylip");
348 convert->read(nameMap);
350 if (m->control_pressed) { delete convert; return 0; }
352 distfile = convert->getOutputFile();
354 //if no names file given with phylip file, create it
355 ListVector* listToMakeNameFile = convert->getListVector();
356 if (namefile == "") { //you need to make a namefile for split matrix
358 namefile = phylipfile + ".names";
359 m->openOutputFile(namefile, out);
360 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
361 string bin = listToMakeNameFile->get(i);
362 out << bin << '\t' << bin << endl;
366 delete listToMakeNameFile;
369 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
371 if (m->control_pressed) { return 0; }
374 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
376 //split matrix into non-overlapping groups
378 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
379 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
380 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
381 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
385 if (m->control_pressed) { delete split; return 0; }
387 singletonName = split->getSingletonNames();
388 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
391 //output a merged distance file
392 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
395 if (m->control_pressed) { return 0; }
397 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
400 //****************** break up files between processes and cluster each file set ******************************//
402 ////you are process 0 from above////
404 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
405 dividedNames.resize(processors);
407 //for each file group figure out which process will complete it
408 //want to divide the load intelligently so the big files are spread between processes
409 for (int i = 0; i < distName.size(); i++) {
410 int processToAssign = (i+1) % processors;
411 if (processToAssign == 0) { processToAssign = processors; }
413 dividedNames[(processToAssign-1)].push_back(distName[i]);
416 //not lets reverse the order of ever other process, so we balance big files running with little ones
417 for (int i = 0; i < processors; i++) {
418 int remainder = ((i+1) % processors);
419 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
423 //send each child the list of files it needs to process
424 for(int i = 1; i < processors; i++) {
425 //send number of file pairs
426 int num = dividedNames[i].size();
427 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
429 for (int j = 0; j < num; j++) { //send filenames to process i
430 char tempDistFileName[1024];
431 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
432 int lengthDist = (dividedNames[i][j].begin()->first).length();
434 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
435 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
437 char tempNameFileName[1024];
438 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
439 int lengthName = (dividedNames[i][j].begin()->second).length();
441 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
442 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
447 listFileNames = cluster(dividedNames[0], labels);
449 //receive the other processes info
450 for(int i = 1; i < processors; i++) {
451 int num = dividedNames[i].size();
454 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
455 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
457 //send list filenames to root process
458 for (int j = 0; j < num; j++) {
460 char tempListFileName[1024];
462 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
463 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
465 string myListFileName = tempListFileName;
466 myListFileName = myListFileName.substr(0, lengthList);
468 listFileNames.push_back(myListFileName);
471 //send Labels to root process
473 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
475 for (int j = 0; j < numLabels; j++) {
479 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
480 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
482 string myLabel = tempLabel;
483 myLabel = myLabel.substr(0, lengthLabel);
485 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
489 }else { //you are a child process
490 vector < map<string, string> > myNames;
492 //recieve the files you need to process
493 //receive number of file pairs
495 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
499 for (int j = 0; j < num; j++) { //receive filenames to process
501 char tempDistFileName[1024];
503 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
504 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
506 string myDistFileName = tempDistFileName;
507 myDistFileName = myDistFileName.substr(0, lengthDist);
510 char tempNameFileName[1024];
512 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
513 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
515 string myNameFileName = tempNameFileName;
516 myNameFileName = myNameFileName.substr(0, lengthName);
519 myNames[j][myDistFileName] = myNameFileName;
523 listFileNames = cluster(myNames, labels);
526 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
528 //send list filenames to root process
529 for (int j = 0; j < num; j++) {
530 char tempListFileName[1024];
531 strcpy(tempListFileName, listFileNames[j].c_str());
532 int lengthList = listFileNames[j].length();
534 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
535 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
538 //send Labels to root process
539 int numLabels = labels.size();
540 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
542 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
544 strcpy(tempLabel, (*it).c_str());
545 int lengthLabel = (*it).length();
547 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
548 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
553 MPI_Barrier(MPI_COMM_WORLD);
557 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
559 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
561 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
562 dividedNames.resize(processors);
564 //for each file group figure out which process will complete it
565 //want to divide the load intelligently so the big files are spread between processes
566 for (int i = 0; i < distName.size(); i++) {
567 int processToAssign = (i+1) % processors;
568 if (processToAssign == 0) { processToAssign = processors; }
570 dividedNames[(processToAssign-1)].push_back(distName[i]);
573 //not lets reverse the order of ever other process, so we balance big files running with little ones
574 for (int i = 0; i < processors; i++) {
575 int remainder = ((i+1) % processors);
576 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
579 createProcesses(dividedNames);
581 if (m->control_pressed) { return 0; }
583 //get list of list file names from each process
584 for(int i=0;i<processors;i++){
585 string filename = toString(processIDS[i]) + ".temp";
587 m->openInputFile(filename, in);
589 in >> tag; m->gobble(in);
593 in >> tempName; m->gobble(in);
594 listFileNames.push_back(tempName);
597 remove((toString(processIDS[i]) + ".temp").c_str());
600 filename = toString(processIDS[i]) + ".temp.labels";
602 m->openInputFile(filename, in2);
605 in2 >> tempCutoff; m->gobble(in2);
606 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
610 in2 >> tempName; m->gobble(in2);
611 if (labels.count(tempName) == 0) { labels.insert(tempName); }
614 remove((toString(processIDS[i]) + ".temp.labels").c_str());
618 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
621 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
623 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
625 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
627 //****************** merge list file and create rabund and sabund files ******************************//
629 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
632 if (pid == 0) { //only process 0 merges
635 ListVector* listSingle;
636 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
638 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
640 mergeLists(listFileNames, labelBins, listSingle);
642 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
644 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
646 //set list file as new current listfile
648 itTypes = outputTypes.find("list");
649 if (itTypes != outputTypes.end()) {
650 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
653 //set rabund file as new current rabundfile
654 itTypes = outputTypes.find("rabund");
655 if (itTypes != outputTypes.end()) {
656 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
659 //set sabund file as new current sabundfile
660 itTypes = outputTypes.find("sabund");
661 if (itTypes != outputTypes.end()) {
662 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
665 m->mothurOutEndLine();
666 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
667 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
668 m->mothurOutEndLine();
671 } //only process 0 merges
674 MPI_Barrier(MPI_COMM_WORLD);
679 catch(exception& e) {
680 m->errorOut(e, "ClusterSplitCommand", "execute");
684 //**********************************************************************************************************************
685 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
688 map<float, int> labelBin;
689 vector<float> orderFloat;
693 if (singleton != "none") {
695 m->openInputFile(singleton, in);
697 string firstCol, secondCol;
698 listSingle = new ListVector();
700 in >> firstCol >> secondCol; m->gobble(in);
701 listSingle->push_back(secondCol);
704 remove(singleton.c_str());
706 numSingleBins = listSingle->getNumBins();
707 }else{ listSingle = NULL; numSingleBins = 0; }
709 //go through users set and make them floats so we can sort them
710 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
713 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
714 else if (*it == "unique") { temp = -1.0; }
716 if (temp <= cutoff) {
717 orderFloat.push_back(temp);
718 labelBin[temp] = numSingleBins; //initialize numbins
723 sort(orderFloat.begin(), orderFloat.end());
726 //get the list info from each file
727 for (int k = 0; k < listNames.size(); k++) {
729 if (m->control_pressed) {
730 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
731 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
735 InputData* input = new InputData(listNames[k], "list");
736 ListVector* list = input->getListVector();
737 string lastLabel = list->getLabel();
739 string filledInList = listNames[k] + "filledInTemp";
741 m->openOutputFile(filledInList, outFilled);
743 //for each label needed
744 for(int l = 0; l < orderFloat.size(); l++){
747 if (orderFloat[l] == -1) { thisLabel = "unique"; }
748 else { thisLabel = toString(orderFloat[l], length-1); }
750 //this file has reached the end
752 list = input->getListVector(lastLabel, true);
753 }else{ //do you have the distance, or do you need to fill in
756 if (list->getLabel() == "unique") { labelFloat = -1.0; }
757 else { convert(list->getLabel(), labelFloat); }
759 //check for missing labels
760 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
761 //if its bigger get last label, otherwise keep it
763 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
765 lastLabel = list->getLabel();
769 list->setLabel(thisLabel);
770 list->print(outFilled);
773 labelBin[orderFloat[l]] += list->getNumBins();
777 list = input->getListVector();
780 if (list != NULL) { delete list; }
784 remove(listNames[k].c_str());
785 rename(filledInList.c_str(), listNames[k].c_str());
790 catch(exception& e) {
791 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
795 //**********************************************************************************************************************
796 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
798 if (outputDir == "") { outputDir += m->hasPath(distfile); }
799 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
801 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
802 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
803 m->openOutputFile(fileroot+ tag + ".list", outList);
805 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
806 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
807 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
809 map<float, int>::iterator itLabel;
811 //for each label needed
812 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
815 if (itLabel->first == -1) { thisLabel = "unique"; }
816 else { thisLabel = toString(itLabel->first, length-1); }
818 outList << thisLabel << '\t' << itLabel->second << '\t';
820 RAbundVector* rabund = new RAbundVector();
821 rabund->setLabel(thisLabel);
824 if (listSingle != NULL) {
825 for (int j = 0; j < listSingle->getNumBins(); j++) {
826 outList << listSingle->get(j) << '\t';
827 rabund->push_back(m->getNumNames(listSingle->get(j)));
831 //get the list info from each file
832 for (int k = 0; k < listNames.size(); k++) {
834 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
836 InputData* input = new InputData(listNames[k], "list");
837 ListVector* list = input->getListVector(thisLabel);
839 //this file has reached the end
840 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
842 for (int j = 0; j < list->getNumBins(); j++) {
843 outList << list->get(j) << '\t';
844 rabund->push_back(m->getNumNames(list->get(j)));
851 SAbundVector sabund = rabund->getSAbundVector();
853 sabund.print(outSabund);
854 rabund->print(outRabund);
864 if (listSingle != NULL) { delete listSingle; }
866 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
870 catch(exception& e) {
871 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
876 //**********************************************************************************************************************
878 void ClusterSplitCommand::printData(ListVector* oldList){
880 string label = oldList->getLabel();
881 RAbundVector oldRAbund = oldList->getRAbundVector();
883 oldRAbund.setLabel(label);
884 if (m->isTrue(showabund)) {
885 oldRAbund.getSAbundVector().print(cout);
887 oldRAbund.print(outRabund);
888 oldRAbund.getSAbundVector().print(outSabund);
890 oldList->print(outList);
892 catch(exception& e) {
893 m->errorOut(e, "ClusterSplitCommand", "printData");
897 //**********************************************************************************************************************
898 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
901 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
906 //loop through and create all the processes you want
907 while (process != processors) {
911 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
915 vector<string> listFileNames = cluster(dividedNames[process], labels);
917 //write out names to file
918 string filename = toString(getpid()) + ".temp";
920 m->openOutputFile(filename, out);
922 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
927 filename = toString(getpid()) + ".temp.labels";
928 m->openOutputFile(filename, outLabels);
930 outLabels << cutoff << endl;
931 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
932 outLabels << (*it) << endl;
938 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
939 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
944 //force parent to wait until all the processes are done
945 for (int i=0;i<processors;i++) {
946 int temp = processIDS[i];
954 catch(exception& e) {
955 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
959 //**********************************************************************************************************************
961 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
964 SparseMatrix* matrix;
967 RAbundVector* rabund;
969 vector<string> listFileNames;
971 double smallestCutoff = cutoff;
973 //cluster each distance file
974 for (int i = 0; i < distNames.size(); i++) {
975 if (m->control_pressed) { return listFileNames; }
977 string thisNamefile = distNames[i].begin()->second;
978 string thisDistFile = distNames[i].begin()->first;
982 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
984 //output your files too
986 cout << endl << "Reading " << thisDistFile << endl;
990 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
992 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
993 read->setCutoff(cutoff);
995 NameAssignment* nameMap = new NameAssignment(thisNamefile);
999 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
1001 list = read->getListVector();
1003 matrix = read->getMatrix();
1010 //output your files too
1012 cout << endl << "Clustering " << thisDistFile << endl;
1016 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1018 rabund = new RAbundVector(list->getRAbundVector());
1021 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1022 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1023 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1024 tag = cluster->getTag();
1026 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1027 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1030 m->openOutputFile(fileroot+ tag + ".list", listFile);
1032 listFileNames.push_back(fileroot+ tag + ".list");
1034 float previousDist = 0.00000;
1035 float rndPreviousDist = 0.00000;
1041 double saveCutoff = cutoff;
1043 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1045 if (m->control_pressed) { //clean up
1046 delete matrix; delete list; delete cluster; delete rabund;
1048 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1049 listFileNames.clear(); return listFileNames;
1052 cluster->update(saveCutoff);
1054 float dist = matrix->getSmallDist();
1057 rndDist = m->ceilDist(dist, precision);
1059 rndDist = m->roundDist(dist, precision);
1062 if(previousDist <= 0.0000 && dist != previousDist){
1063 oldList.setLabel("unique");
1064 oldList.print(listFile);
1065 if (labels.count("unique") == 0) { labels.insert("unique"); }
1067 else if(rndDist != rndPreviousDist){
1068 oldList.setLabel(toString(rndPreviousDist, length-1));
1069 oldList.print(listFile);
1070 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1073 previousDist = dist;
1074 rndPreviousDist = rndDist;
1079 if(previousDist <= 0.0000){
1080 oldList.setLabel("unique");
1081 oldList.print(listFile);
1082 if (labels.count("unique") == 0) { labels.insert("unique"); }
1084 else if(rndPreviousDist<cutoff){
1085 oldList.setLabel(toString(rndPreviousDist, length-1));
1086 oldList.print(listFile);
1087 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1090 delete matrix; delete list; delete cluster; delete rabund;
1093 if (m->control_pressed) { //clean up
1094 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1095 listFileNames.clear(); return listFileNames;
1098 remove(thisDistFile.c_str());
1099 remove(thisNamefile.c_str());
1101 if (saveCutoff != cutoff) {
1102 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1103 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1105 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1108 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1111 cutoff = smallestCutoff;
1113 return listFileNames;
1116 catch(exception& e) {
1117 m->errorOut(e, "ClusterSplitCommand", "cluster");
1123 //**********************************************************************************************************************
1125 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1130 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1135 string thisOutputDir = outputDir;
1136 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1137 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1138 remove(outputFileName.c_str());
1141 for (int i = 0; i < distNames.size(); i++) {
1142 if (m->control_pressed) { return 0; }
1144 string thisDistFile = distNames[i].begin()->first;
1146 m->appendFiles(thisDistFile, outputFileName);
1149 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1159 catch(exception& e) {
1160 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1164 //**********************************************************************************************************************