2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::setParameters(){
22 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
23 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
24 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
25 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
26 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
27 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
28 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
29 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
30 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
31 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
32 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
33 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
34 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
35 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
36 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
37 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
38 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
40 vector<string> myArray;
41 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
45 m->errorOut(e, "ClusterSplitCommand", "setParameters");
49 //**********************************************************************************************************************
50 string ClusterSplitCommand::getHelpString(){
52 string helpString = "";
53 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
54 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
55 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
56 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
57 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
58 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
59 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
60 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
61 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
62 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
63 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
64 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
65 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
66 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
67 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
68 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
69 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 ClusterSplitCommand::ClusterSplitCommand(){
86 abort = true; calledHelp = true;
88 vector<string> tempOutNames;
89 outputTypes["list"] = tempOutNames;
90 outputTypes["rabund"] = tempOutNames;
91 outputTypes["sabund"] = tempOutNames;
92 outputTypes["column"] = tempOutNames;
95 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
99 //**********************************************************************************************************************
100 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
101 ClusterSplitCommand::ClusterSplitCommand(string option) {
103 abort = false; calledHelp = false;
106 //allow user to run help
107 if(option == "help") { help(); abort = true; calledHelp = true; }
108 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
111 vector<string> myArray = setParameters();
113 OptionParser parser(option);
114 map<string,string> parameters = parser.getParameters();
116 ValidParameters validParameter("cluster.split");
118 //check to make sure all parameters are valid for command
119 map<string,string>::iterator it;
120 for (it = parameters.begin(); it != parameters.end(); it++) {
121 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
126 //initialize outputTypes
127 vector<string> tempOutNames;
128 outputTypes["list"] = tempOutNames;
129 outputTypes["rabund"] = tempOutNames;
130 outputTypes["sabund"] = tempOutNames;
131 outputTypes["column"] = tempOutNames;
133 //if the user changes the output directory command factory will send this info to us in the output parameter
134 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
136 //if the user changes the input directory command factory will send this info to us in the output parameter
137 string inputDir = validParameter.validFile(parameters, "inputdir", false);
138 if (inputDir == "not found"){ inputDir = ""; }
141 it = parameters.find("phylip");
142 //user has given a template file
143 if(it != parameters.end()){
144 path = m->hasPath(it->second);
145 //if the user has not given a path then, add inputdir. else leave path alone.
146 if (path == "") { parameters["phylip"] = inputDir + it->second; }
149 it = parameters.find("column");
150 //user has given a template file
151 if(it != parameters.end()){
152 path = m->hasPath(it->second);
153 //if the user has not given a path then, add inputdir. else leave path alone.
154 if (path == "") { parameters["column"] = inputDir + it->second; }
157 it = parameters.find("name");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["name"] = inputDir + it->second; }
165 it = parameters.find("taxonomy");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
173 it = parameters.find("fasta");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["fasta"] = inputDir + it->second; }
182 //check for required parameters
183 phylipfile = validParameter.validFile(parameters, "phylip", true);
184 if (phylipfile == "not open") { abort = true; }
185 else if (phylipfile == "not found") { phylipfile = ""; }
186 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
188 columnfile = validParameter.validFile(parameters, "column", true);
189 if (columnfile == "not open") { abort = true; }
190 else if (columnfile == "not found") { columnfile = ""; }
191 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
193 namefile = validParameter.validFile(parameters, "name", true);
194 if (namefile == "not open") { abort = true; }
195 else if (namefile == "not found") { namefile = ""; }
196 else { m->setNameFile(namefile); }
198 fastafile = validParameter.validFile(parameters, "fasta", true);
199 if (fastafile == "not open") { abort = true; }
200 else if (fastafile == "not found") { fastafile = ""; }
201 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
203 taxFile = validParameter.validFile(parameters, "taxonomy", true);
204 if (taxFile == "not open") { abort = true; }
205 else if (taxFile == "not found") { taxFile = ""; }
206 else { m->setTaxonomyFile(taxFile); }
208 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
209 //is there are current file available for either of these?
210 //give priority to column, then phylip, then fasta
211 columnfile = m->getColumnFile();
212 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
214 phylipfile = m->getPhylipFile();
215 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
217 fastafile = m->getFastaFile();
218 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
220 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
226 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
228 if (columnfile != "") {
229 if (namefile == "") {
230 namefile = m->getNameFile();
231 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
233 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
239 if (fastafile != "") {
241 taxFile = m->getTaxonomyFile();
242 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
244 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
249 if (namefile == "") {
250 namefile = m->getNameFile();
251 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
253 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
259 //check for optional parameter and set defaults
260 // ...at some point should added some additional type checking...
261 //get user cutoff and precision or use defaults
263 temp = validParameter.validFile(parameters, "precision", false);
264 if (temp == "not found") { temp = "100"; }
265 //saves precision legnth for formatting below
266 length = temp.length();
267 m->mothurConvert(temp, precision);
269 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
270 hard = m->isTrue(temp);
272 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
273 large = m->isTrue(temp);
275 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
276 m->setProcessors(temp);
277 m->mothurConvert(temp, processors);
279 temp = validParameter.validFile(parameters, "splitmethod", false);
280 if (splitmethod != "fasta") {
281 if (temp == "not found") { splitmethod = "distance"; }
282 else { splitmethod = temp; }
285 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
286 m->mothurConvert(temp, cutoff);
287 cutoff += (5 / (precision * 10.0));
289 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
290 m->mothurConvert(temp, taxLevelCutoff);
292 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
294 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
295 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
297 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
298 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
300 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
302 showabund = validParameter.validFile(parameters, "showabund", false);
303 if (showabund == "not found") { showabund = "T"; }
305 timing = validParameter.validFile(parameters, "timing", false);
306 if (timing == "not found") { timing = "F"; }
310 catch(exception& e) {
311 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
316 //**********************************************************************************************************************
318 int ClusterSplitCommand::execute(){
321 if (abort == true) { if (calledHelp) { return 0; } return 2; }
324 vector<string> listFileNames;
326 string singletonName = "";
327 double saveCutoff = cutoff;
329 //****************** file prep work ******************************//
334 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
335 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
337 if (pid == 0) { //only process 0 converts and splits
341 //if user gave a phylip file convert to column file
342 if (format == "phylip") {
344 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
346 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
348 NameAssignment* nameMap = NULL;
349 convert->setFormat("phylip");
350 convert->read(nameMap);
352 if (m->control_pressed) { delete convert; return 0; }
354 distfile = convert->getOutputFile();
356 //if no names file given with phylip file, create it
357 ListVector* listToMakeNameFile = convert->getListVector();
358 if (namefile == "") { //you need to make a namefile for split matrix
360 namefile = phylipfile + ".names";
361 m->openOutputFile(namefile, out);
362 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
363 string bin = listToMakeNameFile->get(i);
364 out << bin << '\t' << bin << endl;
368 delete listToMakeNameFile;
371 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
373 if (m->control_pressed) { return 0; }
376 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
378 //split matrix into non-overlapping groups
380 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
381 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
382 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
383 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
387 if (m->control_pressed) { delete split; return 0; }
389 singletonName = split->getSingletonNames();
390 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
393 //output a merged distance file
394 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
397 if (m->control_pressed) { return 0; }
399 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
402 //****************** break up files between processes and cluster each file set ******************************//
404 ////you are process 0 from above////
406 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
407 dividedNames.resize(processors);
409 //for each file group figure out which process will complete it
410 //want to divide the load intelligently so the big files are spread between processes
411 for (int i = 0; i < distName.size(); i++) {
412 int processToAssign = (i+1) % processors;
413 if (processToAssign == 0) { processToAssign = processors; }
415 dividedNames[(processToAssign-1)].push_back(distName[i]);
418 //not lets reverse the order of ever other process, so we balance big files running with little ones
419 for (int i = 0; i < processors; i++) {
420 int remainder = ((i+1) % processors);
421 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
425 //send each child the list of files it needs to process
426 for(int i = 1; i < processors; i++) {
427 //send number of file pairs
428 int num = dividedNames[i].size();
429 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
431 for (int j = 0; j < num; j++) { //send filenames to process i
432 char tempDistFileName[1024];
433 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
434 int lengthDist = (dividedNames[i][j].begin()->first).length();
436 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
437 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
439 char tempNameFileName[1024];
440 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
441 int lengthName = (dividedNames[i][j].begin()->second).length();
443 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
444 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
449 listFileNames = cluster(dividedNames[0], labels);
451 //receive the other processes info
452 for(int i = 1; i < processors; i++) {
453 int num = dividedNames[i].size();
456 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
457 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
459 //send list filenames to root process
460 for (int j = 0; j < num; j++) {
462 char tempListFileName[1024];
464 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
465 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
467 string myListFileName = tempListFileName;
468 myListFileName = myListFileName.substr(0, lengthList);
470 listFileNames.push_back(myListFileName);
473 //send Labels to root process
475 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
477 for (int j = 0; j < numLabels; j++) {
481 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
482 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
484 string myLabel = tempLabel;
485 myLabel = myLabel.substr(0, lengthLabel);
487 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
491 }else { //you are a child process
492 vector < map<string, string> > myNames;
494 //recieve the files you need to process
495 //receive number of file pairs
497 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
501 for (int j = 0; j < num; j++) { //receive filenames to process
503 char tempDistFileName[1024];
505 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
506 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
508 string myDistFileName = tempDistFileName;
509 myDistFileName = myDistFileName.substr(0, lengthDist);
512 char tempNameFileName[1024];
514 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
515 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
517 string myNameFileName = tempNameFileName;
518 myNameFileName = myNameFileName.substr(0, lengthName);
521 myNames[j][myDistFileName] = myNameFileName;
525 listFileNames = cluster(myNames, labels);
528 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
530 //send list filenames to root process
531 for (int j = 0; j < num; j++) {
532 char tempListFileName[1024];
533 strcpy(tempListFileName, listFileNames[j].c_str());
534 int lengthList = listFileNames[j].length();
536 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
537 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
540 //send Labels to root process
541 int numLabels = labels.size();
542 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
544 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
546 strcpy(tempLabel, (*it).c_str());
547 int lengthLabel = (*it).length();
549 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
550 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
555 MPI_Barrier(MPI_COMM_WORLD);
560 if (processors > distName.size()) { processors = distName.size(); }
562 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
564 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
567 //cout << processors << '\t' << distName.size() << endl;
568 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
569 dividedNames.resize(processors);
571 //for each file group figure out which process will complete it
572 //want to divide the load intelligently so the big files are spread between processes
573 for (int i = 0; i < distName.size(); i++) {
575 int processToAssign = (i+1) % processors;
576 if (processToAssign == 0) { processToAssign = processors; }
578 dividedNames[(processToAssign-1)].push_back(distName[i]);
581 //not lets reverse the order of ever other process, so we balance big files running with little ones
582 for (int i = 0; i < processors; i++) {
584 int remainder = ((i+1) % processors);
585 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
588 createProcesses(dividedNames);
590 if (m->control_pressed) { return 0; }
592 //get list of list file names from each process
593 for(int i=0;i<processors;i++){
594 string filename = toString(processIDS[i]) + ".temp";
596 m->openInputFile(filename, in);
598 in >> tag; m->gobble(in);
602 in >> tempName; m->gobble(in);
603 listFileNames.push_back(tempName);
606 m->mothurRemove((toString(processIDS[i]) + ".temp"));
609 filename = toString(processIDS[i]) + ".temp.labels";
611 m->openInputFile(filename, in2);
614 in2 >> tempCutoff; m->gobble(in2);
615 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
619 in2 >> tempName; m->gobble(in2);
620 if (labels.count(tempName) == 0) { labels.insert(tempName); }
623 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
627 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
630 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
632 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
634 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
636 //****************** merge list file and create rabund and sabund files ******************************//
638 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
641 if (pid == 0) { //only process 0 merges
644 ListVector* listSingle;
645 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
647 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
649 mergeLists(listFileNames, labelBins, listSingle);
651 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
653 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
655 //set list file as new current listfile
657 itTypes = outputTypes.find("list");
658 if (itTypes != outputTypes.end()) {
659 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
662 //set rabund file as new current rabundfile
663 itTypes = outputTypes.find("rabund");
664 if (itTypes != outputTypes.end()) {
665 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
668 //set sabund file as new current sabundfile
669 itTypes = outputTypes.find("sabund");
670 if (itTypes != outputTypes.end()) {
671 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
674 m->mothurOutEndLine();
675 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
676 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
677 m->mothurOutEndLine();
680 } //only process 0 merges
683 MPI_Barrier(MPI_COMM_WORLD);
688 catch(exception& e) {
689 m->errorOut(e, "ClusterSplitCommand", "execute");
693 //**********************************************************************************************************************
694 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
697 map<float, int> labelBin;
698 vector<float> orderFloat;
702 if (singleton != "none") {
704 m->openInputFile(singleton, in);
706 string firstCol, secondCol;
707 listSingle = new ListVector();
709 in >> firstCol >> secondCol; m->gobble(in);
710 listSingle->push_back(secondCol);
713 m->mothurRemove(singleton);
715 numSingleBins = listSingle->getNumBins();
716 }else{ listSingle = NULL; numSingleBins = 0; }
718 //go through users set and make them floats so we can sort them
719 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
722 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
723 else if (*it == "unique") { temp = -1.0; }
725 if (temp <= cutoff) {
726 orderFloat.push_back(temp);
727 labelBin[temp] = numSingleBins; //initialize numbins
732 sort(orderFloat.begin(), orderFloat.end());
735 //get the list info from each file
736 for (int k = 0; k < listNames.size(); k++) {
738 if (m->control_pressed) {
739 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
740 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
744 InputData* input = new InputData(listNames[k], "list");
745 ListVector* list = input->getListVector();
746 string lastLabel = list->getLabel();
748 string filledInList = listNames[k] + "filledInTemp";
750 m->openOutputFile(filledInList, outFilled);
752 //for each label needed
753 for(int l = 0; l < orderFloat.size(); l++){
756 if (orderFloat[l] == -1) { thisLabel = "unique"; }
757 else { thisLabel = toString(orderFloat[l], length-1); }
759 //this file has reached the end
761 list = input->getListVector(lastLabel, true);
762 }else{ //do you have the distance, or do you need to fill in
765 if (list->getLabel() == "unique") { labelFloat = -1.0; }
766 else { convert(list->getLabel(), labelFloat); }
768 //check for missing labels
769 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
770 //if its bigger get last label, otherwise keep it
772 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
774 lastLabel = list->getLabel();
778 list->setLabel(thisLabel);
779 list->print(outFilled);
782 labelBin[orderFloat[l]] += list->getNumBins();
786 list = input->getListVector();
789 if (list != NULL) { delete list; }
793 m->mothurRemove(listNames[k]);
794 rename(filledInList.c_str(), listNames[k].c_str());
799 catch(exception& e) {
800 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
804 //**********************************************************************************************************************
805 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
807 if (outputDir == "") { outputDir += m->hasPath(distfile); }
808 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
810 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
811 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
812 m->openOutputFile(fileroot+ tag + ".list", outList);
814 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
815 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
816 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
818 map<float, int>::iterator itLabel;
820 //for each label needed
821 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
824 if (itLabel->first == -1) { thisLabel = "unique"; }
825 else { thisLabel = toString(itLabel->first, length-1); }
827 outList << thisLabel << '\t' << itLabel->second << '\t';
829 RAbundVector* rabund = new RAbundVector();
830 rabund->setLabel(thisLabel);
833 if (listSingle != NULL) {
834 for (int j = 0; j < listSingle->getNumBins(); j++) {
835 outList << listSingle->get(j) << '\t';
836 rabund->push_back(m->getNumNames(listSingle->get(j)));
840 //get the list info from each file
841 for (int k = 0; k < listNames.size(); k++) {
843 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } delete rabund; return 0; }
845 InputData* input = new InputData(listNames[k], "list");
846 ListVector* list = input->getListVector(thisLabel);
848 //this file has reached the end
849 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
851 for (int j = 0; j < list->getNumBins(); j++) {
852 outList << list->get(j) << '\t';
853 rabund->push_back(m->getNumNames(list->get(j)));
860 SAbundVector sabund = rabund->getSAbundVector();
862 sabund.print(outSabund);
863 rabund->print(outRabund);
873 if (listSingle != NULL) { delete listSingle; }
875 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
879 catch(exception& e) {
880 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
885 //**********************************************************************************************************************
887 void ClusterSplitCommand::printData(ListVector* oldList){
889 string label = oldList->getLabel();
890 RAbundVector oldRAbund = oldList->getRAbundVector();
892 oldRAbund.setLabel(label);
893 if (m->isTrue(showabund)) {
894 oldRAbund.getSAbundVector().print(cout);
896 oldRAbund.print(outRabund);
897 oldRAbund.getSAbundVector().print(outSabund);
899 oldList->print(outList);
901 catch(exception& e) {
902 m->errorOut(e, "ClusterSplitCommand", "printData");
906 //**********************************************************************************************************************
907 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
910 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
915 //loop through and create all the processes you want
916 while (process != processors) {
920 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
924 vector<string> listFileNames = cluster(dividedNames[process], labels);
926 //write out names to file
927 string filename = toString(getpid()) + ".temp";
929 m->openOutputFile(filename, out);
931 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
936 filename = toString(getpid()) + ".temp.labels";
937 m->openOutputFile(filename, outLabels);
939 outLabels << cutoff << endl;
940 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
941 outLabels << (*it) << endl;
947 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
948 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
953 //force parent to wait until all the processes are done
954 for (int i=0;i<processors;i++) {
955 int temp = processIDS[i];
963 catch(exception& e) {
964 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
968 //**********************************************************************************************************************
970 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
973 SparseMatrix* matrix;
976 RAbundVector* rabund;
978 vector<string> listFileNames;
980 double smallestCutoff = cutoff;
982 //cluster each distance file
983 for (int i = 0; i < distNames.size(); i++) {
984 if (m->control_pressed) { return listFileNames; }
986 string thisNamefile = distNames[i].begin()->second;
987 string thisDistFile = distNames[i].begin()->first;
991 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
993 //output your files too
995 cout << endl << "Reading " << thisDistFile << endl;
999 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1001 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1002 read->setCutoff(cutoff);
1004 NameAssignment* nameMap = new NameAssignment(thisNamefile);
1006 read->read(nameMap);
1008 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
1010 list = read->getListVector();
1012 matrix = read->getMatrix();
1019 //output your files too
1021 cout << endl << "Clustering " << thisDistFile << endl;
1025 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1027 rabund = new RAbundVector(list->getRAbundVector());
1030 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1031 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1032 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1033 tag = cluster->getTag();
1035 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1036 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1039 m->openOutputFile(fileroot+ tag + ".list", listFile);
1041 listFileNames.push_back(fileroot+ tag + ".list");
1043 float previousDist = 0.00000;
1044 float rndPreviousDist = 0.00000;
1050 double saveCutoff = cutoff;
1052 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1054 if (m->control_pressed) { //clean up
1055 delete matrix; delete list; delete cluster; delete rabund;
1057 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1058 listFileNames.clear(); return listFileNames;
1061 cluster->update(saveCutoff);
1063 float dist = matrix->getSmallDist();
1066 rndDist = m->ceilDist(dist, precision);
1068 rndDist = m->roundDist(dist, precision);
1071 if(previousDist <= 0.0000 && dist != previousDist){
1072 oldList.setLabel("unique");
1073 oldList.print(listFile);
1074 if (labels.count("unique") == 0) { labels.insert("unique"); }
1076 else if(rndDist != rndPreviousDist){
1077 oldList.setLabel(toString(rndPreviousDist, length-1));
1078 oldList.print(listFile);
1079 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1082 previousDist = dist;
1083 rndPreviousDist = rndDist;
1088 if(previousDist <= 0.0000){
1089 oldList.setLabel("unique");
1090 oldList.print(listFile);
1091 if (labels.count("unique") == 0) { labels.insert("unique"); }
1093 else if(rndPreviousDist<cutoff){
1094 oldList.setLabel(toString(rndPreviousDist, length-1));
1095 oldList.print(listFile);
1096 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1099 delete matrix; delete list; delete cluster; delete rabund;
1102 if (m->control_pressed) { //clean up
1103 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1104 listFileNames.clear(); return listFileNames;
1107 m->mothurRemove(thisDistFile);
1108 m->mothurRemove(thisNamefile);
1110 if (saveCutoff != cutoff) {
1111 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1112 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1114 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1117 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1120 cutoff = smallestCutoff;
1122 return listFileNames;
1125 catch(exception& e) {
1126 m->errorOut(e, "ClusterSplitCommand", "cluster");
1132 //**********************************************************************************************************************
1134 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1139 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1144 string thisOutputDir = outputDir;
1145 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1146 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1147 m->mothurRemove(outputFileName);
1150 for (int i = 0; i < distNames.size(); i++) {
1151 if (m->control_pressed) { return 0; }
1153 string thisDistFile = distNames[i].begin()->first;
1155 m->appendFiles(thisDistFile, outputFileName);
1158 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1168 catch(exception& e) {
1169 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1173 //**********************************************************************************************************************