2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::setParameters(){
22 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
23 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
24 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
25 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
26 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
27 CommandParameter ptaxlevel("taxlevel", "Number", "", "1", "", "", "",false,false); parameters.push_back(ptaxlevel);
28 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
29 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
30 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
31 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
32 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
33 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
34 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
35 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
36 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
37 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
38 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40 vector<string> myArray;
41 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
45 m->errorOut(e, "ClusterSplitCommand", "setParameters");
49 //**********************************************************************************************************************
50 string ClusterSplitCommand::getHelpString(){
52 string helpString = "";
53 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
54 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
55 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
56 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
57 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
58 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
59 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
60 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
61 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
62 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
63 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n";
64 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
65 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
66 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
67 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
68 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n";
69 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 ClusterSplitCommand::ClusterSplitCommand(){
86 abort = true; calledHelp = true;
88 vector<string> tempOutNames;
89 outputTypes["list"] = tempOutNames;
90 outputTypes["rabund"] = tempOutNames;
91 outputTypes["sabund"] = tempOutNames;
92 outputTypes["column"] = tempOutNames;
95 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
99 //**********************************************************************************************************************
100 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
101 ClusterSplitCommand::ClusterSplitCommand(string option) {
103 abort = false; calledHelp = false;
106 //allow user to run help
107 if(option == "help") { help(); abort = true; calledHelp = true; }
108 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
111 vector<string> myArray = setParameters();
113 OptionParser parser(option);
114 map<string,string> parameters = parser.getParameters();
116 ValidParameters validParameter("cluster.split");
118 //check to make sure all parameters are valid for command
119 map<string,string>::iterator it;
120 for (it = parameters.begin(); it != parameters.end(); it++) {
121 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
126 //initialize outputTypes
127 vector<string> tempOutNames;
128 outputTypes["list"] = tempOutNames;
129 outputTypes["rabund"] = tempOutNames;
130 outputTypes["sabund"] = tempOutNames;
131 outputTypes["column"] = tempOutNames;
133 //if the user changes the output directory command factory will send this info to us in the output parameter
134 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
136 //if the user changes the input directory command factory will send this info to us in the output parameter
137 string inputDir = validParameter.validFile(parameters, "inputdir", false);
138 if (inputDir == "not found"){ inputDir = ""; }
141 it = parameters.find("phylip");
142 //user has given a template file
143 if(it != parameters.end()){
144 path = m->hasPath(it->second);
145 //if the user has not given a path then, add inputdir. else leave path alone.
146 if (path == "") { parameters["phylip"] = inputDir + it->second; }
149 it = parameters.find("column");
150 //user has given a template file
151 if(it != parameters.end()){
152 path = m->hasPath(it->second);
153 //if the user has not given a path then, add inputdir. else leave path alone.
154 if (path == "") { parameters["column"] = inputDir + it->second; }
157 it = parameters.find("name");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["name"] = inputDir + it->second; }
165 it = parameters.find("taxonomy");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
173 it = parameters.find("fasta");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["fasta"] = inputDir + it->second; }
182 //check for required parameters
183 phylipfile = validParameter.validFile(parameters, "phylip", true);
184 if (phylipfile == "not open") { abort = true; }
185 else if (phylipfile == "not found") { phylipfile = ""; }
186 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
188 columnfile = validParameter.validFile(parameters, "column", true);
189 if (columnfile == "not open") { abort = true; }
190 else if (columnfile == "not found") { columnfile = ""; }
191 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
193 namefile = validParameter.validFile(parameters, "name", true);
194 if (namefile == "not open") { abort = true; }
195 else if (namefile == "not found") { namefile = ""; }
196 else { m->setNameFile(namefile); }
198 fastafile = validParameter.validFile(parameters, "fasta", true);
199 if (fastafile == "not open") { abort = true; }
200 else if (fastafile == "not found") { fastafile = ""; }
201 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
203 taxFile = validParameter.validFile(parameters, "taxonomy", true);
204 if (taxFile == "not open") { abort = true; }
205 else if (taxFile == "not found") { taxFile = ""; }
206 else { m->setTaxonomyFile(taxFile); }
208 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
209 //is there are current file available for either of these?
210 //give priority to column, then phylip, then fasta
211 columnfile = m->getColumnFile();
212 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
214 phylipfile = m->getPhylipFile();
215 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
217 fastafile = m->getFastaFile();
218 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
220 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
226 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
228 if (columnfile != "") {
229 if (namefile == "") {
230 namefile = m->getNameFile();
231 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
233 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
239 if (fastafile != "") {
241 taxFile = m->getTaxonomyFile();
242 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
244 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
249 if (namefile == "") {
250 namefile = m->getNameFile();
251 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
253 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
259 //check for optional parameter and set defaults
260 // ...at some point should added some additional type checking...
261 //get user cutoff and precision or use defaults
263 temp = validParameter.validFile(parameters, "precision", false);
264 if (temp == "not found") { temp = "100"; }
265 //saves precision legnth for formatting below
266 length = temp.length();
267 convert(temp, precision);
269 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
270 hard = m->isTrue(temp);
272 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
273 large = m->isTrue(temp);
275 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
276 m->setProcessors(temp);
277 convert(temp, processors);
279 temp = validParameter.validFile(parameters, "splitmethod", false);
280 if (splitmethod != "fasta") {
281 if (temp == "not found") { splitmethod = "distance"; }
282 else { splitmethod = temp; }
285 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
286 convert(temp, cutoff);
287 cutoff += (5 / (precision * 10.0));
289 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
290 convert(temp, taxLevelCutoff);
292 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
294 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
295 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
297 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
298 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
300 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
302 showabund = validParameter.validFile(parameters, "showabund", false);
303 if (showabund == "not found") { showabund = "T"; }
305 timing = validParameter.validFile(parameters, "timing", false);
306 if (timing == "not found") { timing = "F"; }
310 catch(exception& e) {
311 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
316 //**********************************************************************************************************************
318 int ClusterSplitCommand::execute(){
321 if (abort == true) { if (calledHelp) { return 0; } return 2; }
324 vector<string> listFileNames;
326 string singletonName = "";
327 double saveCutoff = cutoff;
329 //****************** file prep work ******************************//
334 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
335 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
337 if (pid == 0) { //only process 0 converts and splits
341 //if user gave a phylip file convert to column file
342 if (format == "phylip") {
344 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
346 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
348 NameAssignment* nameMap = NULL;
349 convert->setFormat("phylip");
350 convert->read(nameMap);
352 if (m->control_pressed) { delete convert; return 0; }
354 distfile = convert->getOutputFile();
356 //if no names file given with phylip file, create it
357 ListVector* listToMakeNameFile = convert->getListVector();
358 if (namefile == "") { //you need to make a namefile for split matrix
360 namefile = phylipfile + ".names";
361 m->openOutputFile(namefile, out);
362 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
363 string bin = listToMakeNameFile->get(i);
364 out << bin << '\t' << bin << endl;
368 delete listToMakeNameFile;
371 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
373 if (m->control_pressed) { return 0; }
376 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
378 //split matrix into non-overlapping groups
380 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
381 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
382 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
383 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
387 if (m->control_pressed) { delete split; return 0; }
389 singletonName = split->getSingletonNames();
390 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
393 //output a merged distance file
394 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
397 if (m->control_pressed) { return 0; }
399 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
402 //****************** break up files between processes and cluster each file set ******************************//
404 ////you are process 0 from above////
406 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
407 dividedNames.resize(processors);
409 //for each file group figure out which process will complete it
410 //want to divide the load intelligently so the big files are spread between processes
411 for (int i = 0; i < distName.size(); i++) {
412 int processToAssign = (i+1) % processors;
413 if (processToAssign == 0) { processToAssign = processors; }
415 dividedNames[(processToAssign-1)].push_back(distName[i]);
418 //not lets reverse the order of ever other process, so we balance big files running with little ones
419 for (int i = 0; i < processors; i++) {
420 int remainder = ((i+1) % processors);
421 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
425 //send each child the list of files it needs to process
426 for(int i = 1; i < processors; i++) {
427 //send number of file pairs
428 int num = dividedNames[i].size();
429 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
431 for (int j = 0; j < num; j++) { //send filenames to process i
432 char tempDistFileName[1024];
433 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
434 int lengthDist = (dividedNames[i][j].begin()->first).length();
436 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
437 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
439 char tempNameFileName[1024];
440 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
441 int lengthName = (dividedNames[i][j].begin()->second).length();
443 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
444 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
449 listFileNames = cluster(dividedNames[0], labels);
451 //receive the other processes info
452 for(int i = 1; i < processors; i++) {
453 int num = dividedNames[i].size();
456 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
457 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
459 //send list filenames to root process
460 for (int j = 0; j < num; j++) {
462 char tempListFileName[1024];
464 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
465 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
467 string myListFileName = tempListFileName;
468 myListFileName = myListFileName.substr(0, lengthList);
470 listFileNames.push_back(myListFileName);
473 //send Labels to root process
475 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
477 for (int j = 0; j < numLabels; j++) {
481 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
482 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
484 string myLabel = tempLabel;
485 myLabel = myLabel.substr(0, lengthLabel);
487 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
491 }else { //you are a child process
492 vector < map<string, string> > myNames;
494 //recieve the files you need to process
495 //receive number of file pairs
497 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
501 for (int j = 0; j < num; j++) { //receive filenames to process
503 char tempDistFileName[1024];
505 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
506 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
508 string myDistFileName = tempDistFileName;
509 myDistFileName = myDistFileName.substr(0, lengthDist);
512 char tempNameFileName[1024];
514 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
515 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
517 string myNameFileName = tempNameFileName;
518 myNameFileName = myNameFileName.substr(0, lengthName);
521 myNames[j][myDistFileName] = myNameFileName;
525 listFileNames = cluster(myNames, labels);
528 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
530 //send list filenames to root process
531 for (int j = 0; j < num; j++) {
532 char tempListFileName[1024];
533 strcpy(tempListFileName, listFileNames[j].c_str());
534 int lengthList = listFileNames[j].length();
536 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
537 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
540 //send Labels to root process
541 int numLabels = labels.size();
542 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
544 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
546 strcpy(tempLabel, (*it).c_str());
547 int lengthLabel = (*it).length();
549 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
550 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
555 MPI_Barrier(MPI_COMM_WORLD);
559 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
561 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
563 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
564 dividedNames.resize(processors);
566 //for each file group figure out which process will complete it
567 //want to divide the load intelligently so the big files are spread between processes
568 for (int i = 0; i < distName.size(); i++) {
569 int processToAssign = (i+1) % processors;
570 if (processToAssign == 0) { processToAssign = processors; }
572 dividedNames[(processToAssign-1)].push_back(distName[i]);
575 //not lets reverse the order of ever other process, so we balance big files running with little ones
576 for (int i = 0; i < processors; i++) {
577 int remainder = ((i+1) % processors);
578 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
581 createProcesses(dividedNames);
583 if (m->control_pressed) { return 0; }
585 //get list of list file names from each process
586 for(int i=0;i<processors;i++){
587 string filename = toString(processIDS[i]) + ".temp";
589 m->openInputFile(filename, in);
591 in >> tag; m->gobble(in);
595 in >> tempName; m->gobble(in);
596 listFileNames.push_back(tempName);
599 remove((toString(processIDS[i]) + ".temp").c_str());
602 filename = toString(processIDS[i]) + ".temp.labels";
604 m->openInputFile(filename, in2);
607 in2 >> tempCutoff; m->gobble(in2);
608 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
612 in2 >> tempName; m->gobble(in2);
613 if (labels.count(tempName) == 0) { labels.insert(tempName); }
616 remove((toString(processIDS[i]) + ".temp.labels").c_str());
620 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
623 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
625 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
627 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
629 //****************** merge list file and create rabund and sabund files ******************************//
631 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
634 if (pid == 0) { //only process 0 merges
637 ListVector* listSingle;
638 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
640 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
642 mergeLists(listFileNames, labelBins, listSingle);
644 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
646 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
648 //set list file as new current listfile
650 itTypes = outputTypes.find("list");
651 if (itTypes != outputTypes.end()) {
652 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
655 //set rabund file as new current rabundfile
656 itTypes = outputTypes.find("rabund");
657 if (itTypes != outputTypes.end()) {
658 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
661 //set sabund file as new current sabundfile
662 itTypes = outputTypes.find("sabund");
663 if (itTypes != outputTypes.end()) {
664 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
667 m->mothurOutEndLine();
668 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
669 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
670 m->mothurOutEndLine();
673 } //only process 0 merges
676 MPI_Barrier(MPI_COMM_WORLD);
681 catch(exception& e) {
682 m->errorOut(e, "ClusterSplitCommand", "execute");
686 //**********************************************************************************************************************
687 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
690 map<float, int> labelBin;
691 vector<float> orderFloat;
695 if (singleton != "none") {
697 m->openInputFile(singleton, in);
699 string firstCol, secondCol;
700 listSingle = new ListVector();
702 in >> firstCol >> secondCol; m->gobble(in);
703 listSingle->push_back(secondCol);
706 remove(singleton.c_str());
708 numSingleBins = listSingle->getNumBins();
709 }else{ listSingle = NULL; numSingleBins = 0; }
711 //go through users set and make them floats so we can sort them
712 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
715 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
716 else if (*it == "unique") { temp = -1.0; }
718 if (temp <= cutoff) {
719 orderFloat.push_back(temp);
720 labelBin[temp] = numSingleBins; //initialize numbins
725 sort(orderFloat.begin(), orderFloat.end());
728 //get the list info from each file
729 for (int k = 0; k < listNames.size(); k++) {
731 if (m->control_pressed) {
732 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
733 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
737 InputData* input = new InputData(listNames[k], "list");
738 ListVector* list = input->getListVector();
739 string lastLabel = list->getLabel();
741 string filledInList = listNames[k] + "filledInTemp";
743 m->openOutputFile(filledInList, outFilled);
745 //for each label needed
746 for(int l = 0; l < orderFloat.size(); l++){
749 if (orderFloat[l] == -1) { thisLabel = "unique"; }
750 else { thisLabel = toString(orderFloat[l], length-1); }
752 //this file has reached the end
754 list = input->getListVector(lastLabel, true);
755 }else{ //do you have the distance, or do you need to fill in
758 if (list->getLabel() == "unique") { labelFloat = -1.0; }
759 else { convert(list->getLabel(), labelFloat); }
761 //check for missing labels
762 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
763 //if its bigger get last label, otherwise keep it
765 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
767 lastLabel = list->getLabel();
771 list->setLabel(thisLabel);
772 list->print(outFilled);
775 labelBin[orderFloat[l]] += list->getNumBins();
779 list = input->getListVector();
782 if (list != NULL) { delete list; }
786 remove(listNames[k].c_str());
787 rename(filledInList.c_str(), listNames[k].c_str());
792 catch(exception& e) {
793 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
797 //**********************************************************************************************************************
798 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
800 if (outputDir == "") { outputDir += m->hasPath(distfile); }
801 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
803 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
804 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
805 m->openOutputFile(fileroot+ tag + ".list", outList);
807 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
808 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
809 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
811 map<float, int>::iterator itLabel;
813 //for each label needed
814 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
817 if (itLabel->first == -1) { thisLabel = "unique"; }
818 else { thisLabel = toString(itLabel->first, length-1); }
820 outList << thisLabel << '\t' << itLabel->second << '\t';
822 RAbundVector* rabund = new RAbundVector();
823 rabund->setLabel(thisLabel);
826 if (listSingle != NULL) {
827 for (int j = 0; j < listSingle->getNumBins(); j++) {
828 outList << listSingle->get(j) << '\t';
829 rabund->push_back(m->getNumNames(listSingle->get(j)));
833 //get the list info from each file
834 for (int k = 0; k < listNames.size(); k++) {
836 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
838 InputData* input = new InputData(listNames[k], "list");
839 ListVector* list = input->getListVector(thisLabel);
841 //this file has reached the end
842 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
844 for (int j = 0; j < list->getNumBins(); j++) {
845 outList << list->get(j) << '\t';
846 rabund->push_back(m->getNumNames(list->get(j)));
853 SAbundVector sabund = rabund->getSAbundVector();
855 sabund.print(outSabund);
856 rabund->print(outRabund);
866 if (listSingle != NULL) { delete listSingle; }
868 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
872 catch(exception& e) {
873 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
878 //**********************************************************************************************************************
880 void ClusterSplitCommand::printData(ListVector* oldList){
882 string label = oldList->getLabel();
883 RAbundVector oldRAbund = oldList->getRAbundVector();
885 oldRAbund.setLabel(label);
886 if (m->isTrue(showabund)) {
887 oldRAbund.getSAbundVector().print(cout);
889 oldRAbund.print(outRabund);
890 oldRAbund.getSAbundVector().print(outSabund);
892 oldList->print(outList);
894 catch(exception& e) {
895 m->errorOut(e, "ClusterSplitCommand", "printData");
899 //**********************************************************************************************************************
900 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
903 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
908 //loop through and create all the processes you want
909 while (process != processors) {
913 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
917 vector<string> listFileNames = cluster(dividedNames[process], labels);
919 //write out names to file
920 string filename = toString(getpid()) + ".temp";
922 m->openOutputFile(filename, out);
924 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
929 filename = toString(getpid()) + ".temp.labels";
930 m->openOutputFile(filename, outLabels);
932 outLabels << cutoff << endl;
933 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
934 outLabels << (*it) << endl;
940 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
941 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
946 //force parent to wait until all the processes are done
947 for (int i=0;i<processors;i++) {
948 int temp = processIDS[i];
956 catch(exception& e) {
957 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
961 //**********************************************************************************************************************
963 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
966 SparseMatrix* matrix;
969 RAbundVector* rabund;
971 vector<string> listFileNames;
973 double smallestCutoff = cutoff;
975 //cluster each distance file
976 for (int i = 0; i < distNames.size(); i++) {
977 if (m->control_pressed) { return listFileNames; }
979 string thisNamefile = distNames[i].begin()->second;
980 string thisDistFile = distNames[i].begin()->first;
984 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
986 //output your files too
988 cout << endl << "Reading " << thisDistFile << endl;
992 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
994 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
995 read->setCutoff(cutoff);
997 NameAssignment* nameMap = new NameAssignment(thisNamefile);
1001 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
1003 list = read->getListVector();
1005 matrix = read->getMatrix();
1012 //output your files too
1014 cout << endl << "Clustering " << thisDistFile << endl;
1018 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1020 rabund = new RAbundVector(list->getRAbundVector());
1023 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1024 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1025 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1026 tag = cluster->getTag();
1028 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1029 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1032 m->openOutputFile(fileroot+ tag + ".list", listFile);
1034 listFileNames.push_back(fileroot+ tag + ".list");
1036 float previousDist = 0.00000;
1037 float rndPreviousDist = 0.00000;
1043 double saveCutoff = cutoff;
1045 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1047 if (m->control_pressed) { //clean up
1048 delete matrix; delete list; delete cluster; delete rabund;
1050 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1051 listFileNames.clear(); return listFileNames;
1054 cluster->update(saveCutoff);
1056 float dist = matrix->getSmallDist();
1059 rndDist = m->ceilDist(dist, precision);
1061 rndDist = m->roundDist(dist, precision);
1064 if(previousDist <= 0.0000 && dist != previousDist){
1065 oldList.setLabel("unique");
1066 oldList.print(listFile);
1067 if (labels.count("unique") == 0) { labels.insert("unique"); }
1069 else if(rndDist != rndPreviousDist){
1070 oldList.setLabel(toString(rndPreviousDist, length-1));
1071 oldList.print(listFile);
1072 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1075 previousDist = dist;
1076 rndPreviousDist = rndDist;
1081 if(previousDist <= 0.0000){
1082 oldList.setLabel("unique");
1083 oldList.print(listFile);
1084 if (labels.count("unique") == 0) { labels.insert("unique"); }
1086 else if(rndPreviousDist<cutoff){
1087 oldList.setLabel(toString(rndPreviousDist, length-1));
1088 oldList.print(listFile);
1089 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1092 delete matrix; delete list; delete cluster; delete rabund;
1095 if (m->control_pressed) { //clean up
1096 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1097 listFileNames.clear(); return listFileNames;
1100 remove(thisDistFile.c_str());
1101 remove(thisNamefile.c_str());
1103 if (saveCutoff != cutoff) {
1104 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1105 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1107 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1110 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1113 cutoff = smallestCutoff;
1115 return listFileNames;
1118 catch(exception& e) {
1119 m->errorOut(e, "ClusterSplitCommand", "cluster");
1125 //**********************************************************************************************************************
1127 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1132 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1137 string thisOutputDir = outputDir;
1138 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1139 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1140 remove(outputFileName.c_str());
1143 for (int i = 0; i < distNames.size(); i++) {
1144 if (m->control_pressed) { return 0; }
1146 string thisDistFile = distNames[i].begin()->first;
1148 m->appendFiles(thisDistFile, outputFileName);
1151 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1161 catch(exception& e) {
1162 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1166 //**********************************************************************************************************************