2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
21 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
22 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
23 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
24 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
25 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
26 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
27 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
29 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
30 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
31 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
32 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
33 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
34 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
36 vector<string> myArray;
37 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
41 m->errorOut(e, "ClusterSplitCommand", "setParameters");
45 //**********************************************************************************************************************
46 string ClusterSplitCommand::getHelpString(){
48 string helpString = "";
49 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
50 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
51 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
52 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
53 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
54 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
55 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
56 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
57 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
58 helpString += "The name parameter allows you to enter your name file. \n";
59 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
60 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
61 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
62 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
63 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
64 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
65 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
66 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
67 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
69 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
71 helpString += "The cluster.split command should be in the following format: \n";
72 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
73 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
77 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
81 //**********************************************************************************************************************
82 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){
84 string outputFileName = "";
85 map<string, vector<string> >::iterator it;
87 //is this a type this command creates
88 it = outputTypes.find(type);
89 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
91 if (type == "list") { outputFileName = "list"; }
92 else if (type == "rabund") { outputFileName = "rabund"; }
93 else if (type == "sabund") { outputFileName = "sabund"; }
94 else if (type == "column") { outputFileName = "dist"; }
95 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
97 return outputFileName;
100 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
104 //**********************************************************************************************************************
105 ClusterSplitCommand::ClusterSplitCommand(){
107 abort = true; calledHelp = true;
109 vector<string> tempOutNames;
110 outputTypes["list"] = tempOutNames;
111 outputTypes["rabund"] = tempOutNames;
112 outputTypes["sabund"] = tempOutNames;
113 outputTypes["column"] = tempOutNames;
115 catch(exception& e) {
116 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
120 //**********************************************************************************************************************
121 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
122 ClusterSplitCommand::ClusterSplitCommand(string option) {
124 abort = false; calledHelp = false;
127 //allow user to run help
128 if(option == "help") { help(); abort = true; calledHelp = true; }
129 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
132 vector<string> myArray = setParameters();
134 OptionParser parser(option);
135 map<string,string> parameters = parser.getParameters();
137 ValidParameters validParameter("cluster.split");
139 //check to make sure all parameters are valid for command
140 map<string,string>::iterator it;
141 for (it = parameters.begin(); it != parameters.end(); it++) {
142 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
147 //initialize outputTypes
148 vector<string> tempOutNames;
149 outputTypes["list"] = tempOutNames;
150 outputTypes["rabund"] = tempOutNames;
151 outputTypes["sabund"] = tempOutNames;
152 outputTypes["column"] = tempOutNames;
154 //if the user changes the output directory command factory will send this info to us in the output parameter
155 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
157 //if the user changes the input directory command factory will send this info to us in the output parameter
158 string inputDir = validParameter.validFile(parameters, "inputdir", false);
159 if (inputDir == "not found"){ inputDir = ""; }
162 it = parameters.find("phylip");
163 //user has given a template file
164 if(it != parameters.end()){
165 path = m->hasPath(it->second);
166 //if the user has not given a path then, add inputdir. else leave path alone.
167 if (path == "") { parameters["phylip"] = inputDir + it->second; }
170 it = parameters.find("column");
171 //user has given a template file
172 if(it != parameters.end()){
173 path = m->hasPath(it->second);
174 //if the user has not given a path then, add inputdir. else leave path alone.
175 if (path == "") { parameters["column"] = inputDir + it->second; }
178 it = parameters.find("name");
179 //user has given a template file
180 if(it != parameters.end()){
181 path = m->hasPath(it->second);
182 //if the user has not given a path then, add inputdir. else leave path alone.
183 if (path == "") { parameters["name"] = inputDir + it->second; }
186 it = parameters.find("taxonomy");
187 //user has given a template file
188 if(it != parameters.end()){
189 path = m->hasPath(it->second);
190 //if the user has not given a path then, add inputdir. else leave path alone.
191 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
194 it = parameters.find("fasta");
195 //user has given a template file
196 if(it != parameters.end()){
197 path = m->hasPath(it->second);
198 //if the user has not given a path then, add inputdir. else leave path alone.
199 if (path == "") { parameters["fasta"] = inputDir + it->second; }
202 it = parameters.find("count");
203 //user has given a template file
204 if(it != parameters.end()){
205 path = m->hasPath(it->second);
206 //if the user has not given a path then, add inputdir. else leave path alone.
207 if (path == "") { parameters["count"] = inputDir + it->second; }
211 //check for required parameters
212 phylipfile = validParameter.validFile(parameters, "phylip", true);
213 if (phylipfile == "not open") { abort = true; }
214 else if (phylipfile == "not found") { phylipfile = ""; }
215 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
217 columnfile = validParameter.validFile(parameters, "column", true);
218 if (columnfile == "not open") { abort = true; }
219 else if (columnfile == "not found") { columnfile = ""; }
220 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
222 namefile = validParameter.validFile(parameters, "name", true);
223 if (namefile == "not open") { abort = true; namefile = "";}
224 else if (namefile == "not found") { namefile = ""; }
225 else { m->setNameFile(namefile); }
227 countfile = validParameter.validFile(parameters, "count", true);
228 if (countfile == "not open") { abort = true; countfile = "";}
229 else if (countfile == "not found") { countfile = ""; }
230 else { m->setCountTableFile(countfile); }
232 fastafile = validParameter.validFile(parameters, "fasta", true);
233 if (fastafile == "not open") { abort = true; }
234 else if (fastafile == "not found") { fastafile = ""; }
235 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
237 taxFile = validParameter.validFile(parameters, "taxonomy", true);
238 if (taxFile == "not open") { taxFile = ""; abort = true; }
239 else if (taxFile == "not found") { taxFile = ""; }
240 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
242 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
243 //is there are current file available for either of these?
244 //give priority to column, then phylip, then fasta
245 columnfile = m->getColumnFile();
246 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
248 phylipfile = m->getPhylipFile();
249 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
251 fastafile = m->getFastaFile();
252 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
254 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
260 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
262 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
264 if (columnfile != "") {
265 if ((namefile == "") && (countfile == "")) {
266 namefile = m->getNameFile();
267 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
269 countfile = m->getCountTableFile();
270 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
272 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
279 if (fastafile != "") {
281 taxFile = m->getTaxonomyFile();
282 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
284 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
289 if ((namefile == "") && (countfile == "")) {
290 namefile = m->getNameFile();
291 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
293 countfile = m->getCountTableFile();
294 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
296 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
303 //check for optional parameter and set defaults
304 // ...at some point should added some additional type checking...
305 //get user cutoff and precision or use defaults
307 temp = validParameter.validFile(parameters, "precision", false);
308 if (temp == "not found") { temp = "100"; }
309 //saves precision legnth for formatting below
310 length = temp.length();
311 m->mothurConvert(temp, precision);
313 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
314 hard = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
317 large = m->isTrue(temp);
319 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
320 m->setProcessors(temp);
321 m->mothurConvert(temp, processors);
323 temp = validParameter.validFile(parameters, "splitmethod", false);
324 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
325 if (temp == "not found") { splitmethod = "distance"; }
326 else { splitmethod = temp; }
329 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
330 classic = m->isTrue(temp);
332 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
334 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
335 m->mothurConvert(temp, cutoff);
336 cutoff += (5 / (precision * 10.0));
338 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
339 m->mothurConvert(temp, taxLevelCutoff);
341 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
343 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
344 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
346 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
347 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
349 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
351 showabund = validParameter.validFile(parameters, "showabund", false);
352 if (showabund == "not found") { showabund = "T"; }
354 timing = validParameter.validFile(parameters, "timing", false);
355 if (timing == "not found") { timing = "F"; }
359 catch(exception& e) {
360 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
365 //**********************************************************************************************************************
367 int ClusterSplitCommand::execute(){
370 if (abort == true) { if (calledHelp) { return 0; } return 2; }
373 vector<string> listFileNames;
375 string singletonName = "";
376 double saveCutoff = cutoff;
378 //****************** file prep work ******************************//
383 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
386 if (pid == 0) { //only process 0 converts and splits
390 //if user gave a phylip file convert to column file
391 if (format == "phylip") {
393 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
395 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
397 NameAssignment* nameMap = NULL;
398 convert->setFormat("phylip");
399 convert->read(nameMap);
401 if (m->control_pressed) { delete convert; return 0; }
403 distfile = convert->getOutputFile();
405 //if no names file given with phylip file, create it
406 ListVector* listToMakeNameFile = convert->getListVector();
407 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
409 namefile = phylipfile + ".names";
410 m->openOutputFile(namefile, out);
411 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412 string bin = listToMakeNameFile->get(i);
413 out << bin << '\t' << bin << endl;
417 delete listToMakeNameFile;
420 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
422 if (m->control_pressed) { return 0; }
425 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
427 //split matrix into non-overlapping groups
429 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
430 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
431 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
432 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
436 if (m->control_pressed) { delete split; return 0; }
438 singletonName = split->getSingletonNames();
439 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
442 //output a merged distance file
443 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
446 if (m->control_pressed) { return 0; }
448 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
451 //****************** break up files between processes and cluster each file set ******************************//
453 ////you are process 0 from above////
455 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
456 dividedNames.resize(processors);
458 //for each file group figure out which process will complete it
459 //want to divide the load intelligently so the big files are spread between processes
460 for (int i = 0; i < distName.size(); i++) {
461 int processToAssign = (i+1) % processors;
462 if (processToAssign == 0) { processToAssign = processors; }
464 dividedNames[(processToAssign-1)].push_back(distName[i]);
467 //not lets reverse the order of ever other process, so we balance big files running with little ones
468 for (int i = 0; i < processors; i++) {
469 int remainder = ((i+1) % processors);
470 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
474 //send each child the list of files it needs to process
475 for(int i = 1; i < processors; i++) {
476 //send number of file pairs
477 int num = dividedNames[i].size();
478 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
480 for (int j = 0; j < num; j++) { //send filenames to process i
481 char tempDistFileName[1024];
482 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
483 int lengthDist = (dividedNames[i][j].begin()->first).length();
485 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
486 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
488 char tempNameFileName[1024];
489 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
490 int lengthName = (dividedNames[i][j].begin()->second).length();
492 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
493 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
498 listFileNames = cluster(dividedNames[0], labels);
500 //receive the other processes info
501 for(int i = 1; i < processors; i++) {
502 int num = dividedNames[i].size();
505 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
506 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
508 //send list filenames to root process
509 for (int j = 0; j < num; j++) {
511 char tempListFileName[1024];
513 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
514 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
516 string myListFileName = tempListFileName;
517 myListFileName = myListFileName.substr(0, lengthList);
519 listFileNames.push_back(myListFileName);
522 //send Labels to root process
524 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
526 for (int j = 0; j < numLabels; j++) {
530 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
531 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
533 string myLabel = tempLabel;
534 myLabel = myLabel.substr(0, lengthLabel);
536 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
540 }else { //you are a child process
541 vector < map<string, string> > myNames;
543 //recieve the files you need to process
544 //receive number of file pairs
546 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
550 for (int j = 0; j < num; j++) { //receive filenames to process
552 char tempDistFileName[1024];
554 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
555 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
557 string myDistFileName = tempDistFileName;
558 myDistFileName = myDistFileName.substr(0, lengthDist);
561 char tempNameFileName[1024];
563 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
564 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
566 string myNameFileName = tempNameFileName;
567 myNameFileName = myNameFileName.substr(0, lengthName);
570 myNames[j][myDistFileName] = myNameFileName;
574 listFileNames = cluster(myNames, labels);
577 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
579 //send list filenames to root process
580 for (int j = 0; j < num; j++) {
581 char tempListFileName[1024];
582 strcpy(tempListFileName, listFileNames[j].c_str());
583 int lengthList = listFileNames[j].length();
585 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
586 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
589 //send Labels to root process
590 int numLabels = labels.size();
591 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
593 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
595 strcpy(tempLabel, (*it).c_str());
596 int lengthLabel = (*it).length();
598 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
599 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
604 MPI_Barrier(MPI_COMM_WORLD);
607 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
609 if (processors > distName.size()) { processors = distName.size(); }
611 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
613 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
615 listFileNames = createProcesses(distName, labels);
618 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
621 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
623 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
625 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
627 //****************** merge list file and create rabund and sabund files ******************************//
629 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
632 if (pid == 0) { //only process 0 merges
635 ListVector* listSingle;
636 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
638 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
640 mergeLists(listFileNames, labelBins, listSingle);
642 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
644 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
646 //set list file as new current listfile
648 itTypes = outputTypes.find("list");
649 if (itTypes != outputTypes.end()) {
650 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
653 //set rabund file as new current rabundfile
654 itTypes = outputTypes.find("rabund");
655 if (itTypes != outputTypes.end()) {
656 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
659 //set sabund file as new current sabundfile
660 itTypes = outputTypes.find("sabund");
661 if (itTypes != outputTypes.end()) {
662 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
665 m->mothurOutEndLine();
666 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
667 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
668 m->mothurOutEndLine();
671 } //only process 0 merges
674 MPI_Barrier(MPI_COMM_WORLD);
679 catch(exception& e) {
680 m->errorOut(e, "ClusterSplitCommand", "execute");
684 //**********************************************************************************************************************
685 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
688 map<float, int> labelBin;
689 vector<float> orderFloat;
693 if (singleton != "none") {
696 m->openInputFile(singleton, in);
698 string firstCol, secondCol;
699 listSingle = new ListVector();
701 if (countfile != "") { m->getline(in); m->gobble(in); }
704 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
705 if (countfile == "") { listSingle->push_back(secondCol); }
706 else { listSingle->push_back(firstCol); }
710 m->mothurRemove(singleton);
712 numSingleBins = listSingle->getNumBins();
713 }else{ listSingle = NULL; numSingleBins = 0; }
715 //go through users set and make them floats so we can sort them
716 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
719 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
720 else if (*it == "unique") { temp = -1.0; }
722 if (temp <= cutoff) {
723 orderFloat.push_back(temp);
724 labelBin[temp] = numSingleBins; //initialize numbins
729 sort(orderFloat.begin(), orderFloat.end());
732 //get the list info from each file
733 for (int k = 0; k < listNames.size(); k++) {
735 if (m->control_pressed) {
736 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
737 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
741 InputData* input = new InputData(listNames[k], "list");
742 ListVector* list = input->getListVector();
743 string lastLabel = list->getLabel();
745 string filledInList = listNames[k] + "filledInTemp";
747 m->openOutputFile(filledInList, outFilled);
749 //for each label needed
750 for(int l = 0; l < orderFloat.size(); l++){
753 if (orderFloat[l] == -1) { thisLabel = "unique"; }
754 else { thisLabel = toString(orderFloat[l], length-1); }
756 //this file has reached the end
758 list = input->getListVector(lastLabel, true);
759 }else{ //do you have the distance, or do you need to fill in
762 if (list->getLabel() == "unique") { labelFloat = -1.0; }
763 else { convert(list->getLabel(), labelFloat); }
765 //check for missing labels
766 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
767 //if its bigger get last label, otherwise keep it
769 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
771 lastLabel = list->getLabel();
775 list->setLabel(thisLabel);
776 list->print(outFilled);
779 labelBin[orderFloat[l]] += list->getNumBins();
783 list = input->getListVector();
786 if (list != NULL) { delete list; }
790 m->mothurRemove(listNames[k]);
791 rename(filledInList.c_str(), listNames[k].c_str());
796 catch(exception& e) {
797 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
801 //**********************************************************************************************************************
802 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
804 if (outputDir == "") { outputDir += m->hasPath(distfile); }
805 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
807 string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
808 string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
809 string listFileName = fileroot+ tag + ".";
810 if (countfile != "") { listFileName += "unique_"; }
811 listFileName += getOutputFileNameTag("list");
813 if (countfile == "") {
814 m->openOutputFile(sabundFileName, outSabund);
815 m->openOutputFile(rabundFileName, outRabund);
816 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
817 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
820 m->openOutputFile(listFileName, outList);
821 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
824 map<float, int>::iterator itLabel;
826 //for each label needed
827 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
830 if (itLabel->first == -1) { thisLabel = "unique"; }
831 else { thisLabel = toString(itLabel->first, length-1); }
833 outList << thisLabel << '\t' << itLabel->second << '\t';
835 RAbundVector* rabund = NULL;
836 if (countfile == "") {
837 rabund = new RAbundVector();
838 rabund->setLabel(thisLabel);
842 if (listSingle != NULL) {
843 for (int j = 0; j < listSingle->getNumBins(); j++) {
844 outList << listSingle->get(j) << '\t';
845 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
849 //get the list info from each file
850 for (int k = 0; k < listNames.size(); k++) {
852 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
854 InputData* input = new InputData(listNames[k], "list");
855 ListVector* list = input->getListVector(thisLabel);
857 //this file has reached the end
858 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
860 for (int j = 0; j < list->getNumBins(); j++) {
861 outList << list->get(j) << '\t';
862 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
869 if (countfile == "") {
870 SAbundVector sabund = rabund->getSAbundVector();
871 sabund.print(outSabund);
872 rabund->print(outRabund);
876 if (rabund != NULL) { delete rabund; }
880 if (countfile == "") {
884 if (listSingle != NULL) { delete listSingle; }
886 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
890 catch(exception& e) {
891 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
896 //**********************************************************************************************************************
898 void ClusterSplitCommand::printData(ListVector* oldList){
900 string label = oldList->getLabel();
901 RAbundVector oldRAbund = oldList->getRAbundVector();
903 oldRAbund.setLabel(label);
904 if (m->isTrue(showabund)) {
905 oldRAbund.getSAbundVector().print(cout);
907 oldRAbund.print(outRabund);
908 oldRAbund.getSAbundVector().print(outSabund);
910 oldList->print(outList);
912 catch(exception& e) {
913 m->errorOut(e, "ClusterSplitCommand", "printData");
917 //**********************************************************************************************************************
918 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
921 vector<string> listFiles;
922 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
923 dividedNames.resize(processors);
925 //for each file group figure out which process will complete it
926 //want to divide the load intelligently so the big files are spread between processes
927 for (int i = 0; i < distName.size(); i++) {
929 int processToAssign = (i+1) % processors;
930 if (processToAssign == 0) { processToAssign = processors; }
932 dividedNames[(processToAssign-1)].push_back(distName[i]);
933 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
936 //not lets reverse the order of ever other process, so we balance big files running with little ones
937 for (int i = 0; i < processors; i++) {
939 int remainder = ((i+1) % processors);
940 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
943 if (m->control_pressed) { return listFiles; }
945 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
949 //loop through and create all the processes you want
950 while (process != processors) {
954 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
958 vector<string> listFileNames = cluster(dividedNames[process], labels);
960 //write out names to file
961 string filename = toString(getpid()) + ".temp";
963 m->openOutputFile(filename, out);
965 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
970 filename = toString(getpid()) + ".temp.labels";
971 m->openOutputFile(filename, outLabels);
973 outLabels << cutoff << endl;
974 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
975 outLabels << (*it) << endl;
981 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
982 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
988 listFiles = cluster(dividedNames[0], labels);
990 //force parent to wait until all the processes are done
991 for (int i=0;i< processIDS.size();i++) {
992 int temp = processIDS[i];
996 //get list of list file names from each process
997 for(int i=0;i<processIDS.size();i++){
998 string filename = toString(processIDS[i]) + ".temp";
1000 m->openInputFile(filename, in);
1002 in >> tag; m->gobble(in);
1006 in >> tempName; m->gobble(in);
1007 listFiles.push_back(tempName);
1010 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1013 filename = toString(processIDS[i]) + ".temp.labels";
1015 m->openInputFile(filename, in2);
1018 in2 >> tempCutoff; m->gobble(in2);
1019 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1023 in2 >> tempName; m->gobble(in2);
1024 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1027 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1033 //////////////////////////////////////////////////////////////////////////////////////////////////////
1034 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1035 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1036 //Taking advantage of shared memory to allow both threads to add labels.
1037 //////////////////////////////////////////////////////////////////////////////////////////////////////
1039 vector<clusterData*> pDataArray;
1040 DWORD dwThreadIdArray[processors-1];
1041 HANDLE hThreadArray[processors-1];
1043 //Create processor worker threads.
1044 for( int i=1; i<processors; i++ ){
1045 // Allocate memory for thread data.
1046 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1047 pDataArray.push_back(tempCluster);
1048 processIDS.push_back(i);
1050 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1051 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1052 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1057 listFiles = cluster(dividedNames[0], labels);
1059 //Wait until all threads have terminated.
1060 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1062 //Close all thread handles and free memory allocations.
1063 for(int i=0; i < pDataArray.size(); i++){
1065 tag = pDataArray[i]->tag;
1066 //get listfiles created
1067 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1069 set<string>::iterator it;
1070 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1072 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1073 CloseHandle(hThreadArray[i]);
1074 delete pDataArray[i];
1082 catch(exception& e) {
1083 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1087 //**********************************************************************************************************************
1089 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1092 vector<string> listFileNames;
1093 double smallestCutoff = cutoff;
1095 //cluster each distance file
1096 for (int i = 0; i < distNames.size(); i++) {
1098 string thisNamefile = distNames[i].begin()->second;
1099 string thisDistFile = distNames[i].begin()->first;
1101 string listFileName = "";
1102 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1103 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1105 if (m->control_pressed) { //clean up
1106 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1107 listFileNames.clear(); return listFileNames;
1110 listFileNames.push_back(listFileName);
1113 cutoff = smallestCutoff;
1115 return listFileNames;
1118 catch(exception& e) {
1119 m->errorOut(e, "ClusterSplitCommand", "cluster");
1125 //**********************************************************************************************************************
1126 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1128 string listFileName = "";
1130 ListVector* list = NULL;
1132 RAbundVector* rabund = NULL;
1136 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1138 //output your files too
1140 cout << endl << "Reading " << thisDistFile << endl;
1144 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1146 //reads phylip file storing data in 2D vector, also fills list and rabund
1148 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1150 NameAssignment* nameMap = NULL;
1151 CountTable* ct = NULL;
1153 nameMap = new NameAssignment(thisNamefile);
1155 cluster->readPhylipFile(thisDistFile, nameMap);
1156 }else if (countfile != "") {
1157 ct = new CountTable();
1158 ct->readTable(thisNamefile);
1159 cluster->readPhylipFile(thisDistFile, ct);
1161 tag = cluster->getTag();
1163 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1164 else { delete ct; } delete cluster; return 0; }
1166 list = cluster->getListVector();
1167 rabund = cluster->getRAbundVector();
1169 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1170 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1171 listFileName = fileroot+ tag + ".list";
1174 m->openOutputFile(fileroot+ tag + ".list", listFile);
1176 float previousDist = 0.00000;
1177 float rndPreviousDist = 0.00000;
1181 //output your files too
1183 cout << endl << "Clustering " << thisDistFile << endl;
1187 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1189 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1190 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1191 else { delete ct; } return listFileName; }
1193 cluster->update(cutoff);
1195 float dist = cluster->getSmallDist();
1198 rndDist = m->ceilDist(dist, precision);
1200 rndDist = m->roundDist(dist, precision);
1203 if(previousDist <= 0.0000 && dist != previousDist){
1204 oldList.setLabel("unique");
1205 oldList.print(listFile);
1206 if (labels.count("unique") == 0) { labels.insert("unique"); }
1208 else if(rndDist != rndPreviousDist){
1209 oldList.setLabel(toString(rndPreviousDist, length-1));
1210 oldList.print(listFile);
1211 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1215 previousDist = dist;
1216 rndPreviousDist = rndDist;
1220 if(previousDist <= 0.0000){
1221 oldList.setLabel("unique");
1222 oldList.print(listFile);
1223 if (labels.count("unique") == 0) { labels.insert("unique"); }
1225 else if(rndPreviousDist<cutoff){
1226 oldList.setLabel(toString(rndPreviousDist, length-1));
1227 oldList.print(listFile);
1228 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1234 delete cluster; delete list; delete rabund;
1235 if(namefile != ""){ delete nameMap; }
1238 m->mothurRemove(thisDistFile);
1239 m->mothurRemove(thisNamefile);
1241 return listFileName;
1244 catch(exception& e) {
1245 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1250 //**********************************************************************************************************************
1251 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1253 string listFileName = "";
1255 Cluster* cluster = NULL;
1256 SparseDistanceMatrix* matrix = NULL;
1257 ListVector* list = NULL;
1259 RAbundVector* rabund = NULL;
1261 if (m->control_pressed) { return listFileName; }
1265 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1267 //output your files too
1269 cout << endl << "Reading " << thisDistFile << endl;
1273 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1275 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1276 read->setCutoff(cutoff);
1278 NameAssignment* nameMap = NULL;
1279 CountTable* ct = NULL;
1281 nameMap = new NameAssignment(thisNamefile);
1283 read->read(nameMap);
1284 }else if (countfile != "") {
1285 ct = new CountTable();
1286 ct->readTable(thisNamefile);
1290 list = read->getListVector();
1292 matrix = read->getDMatrix();
1294 if(countfile != "") {
1295 rabund = new RAbundVector();
1296 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1298 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1300 delete read; read = NULL;
1301 if (namefile != "") { delete nameMap; nameMap = NULL; }
1305 //output your files too
1307 cout << endl << "Clustering " << thisDistFile << endl;
1311 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1314 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1315 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1316 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1317 tag = cluster->getTag();
1319 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1320 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1323 m->openOutputFile(fileroot+ tag + ".list", listFile);
1325 listFileName = fileroot+ tag + ".list";
1327 float previousDist = 0.00000;
1328 float rndPreviousDist = 0.00000;
1334 double saveCutoff = cutoff;
1336 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1338 if (m->control_pressed) { //clean up
1339 delete matrix; delete list; delete cluster; delete rabund;
1341 m->mothurRemove(listFileName);
1342 return listFileName;
1345 cluster->update(saveCutoff);
1347 float dist = matrix->getSmallDist();
1350 rndDist = m->ceilDist(dist, precision);
1352 rndDist = m->roundDist(dist, precision);
1355 if(previousDist <= 0.0000 && dist != previousDist){
1356 oldList.setLabel("unique");
1357 oldList.print(listFile);
1358 if (labels.count("unique") == 0) { labels.insert("unique"); }
1360 else if(rndDist != rndPreviousDist){
1361 oldList.setLabel(toString(rndPreviousDist, length-1));
1362 oldList.print(listFile);
1363 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1366 previousDist = dist;
1367 rndPreviousDist = rndDist;
1372 if(previousDist <= 0.0000){
1373 oldList.setLabel("unique");
1374 oldList.print(listFile);
1375 if (labels.count("unique") == 0) { labels.insert("unique"); }
1377 else if(rndPreviousDist<cutoff){
1378 oldList.setLabel(toString(rndPreviousDist, length-1));
1379 oldList.print(listFile);
1380 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1383 delete matrix; delete list; delete cluster; delete rabund;
1384 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1387 if (m->control_pressed) { //clean up
1388 m->mothurRemove(listFileName);
1389 return listFileName;
1392 m->mothurRemove(thisDistFile);
1393 m->mothurRemove(thisNamefile);
1395 if (saveCutoff != cutoff) {
1396 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1397 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1399 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1402 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1404 return listFileName;
1407 catch(exception& e) {
1408 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1412 //**********************************************************************************************************************
1414 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1419 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1424 string thisOutputDir = outputDir;
1425 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1426 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1427 m->mothurRemove(outputFileName);
1430 for (int i = 0; i < distNames.size(); i++) {
1431 if (m->control_pressed) { return 0; }
1433 string thisDistFile = distNames[i].begin()->first;
1435 m->appendFiles(thisDistFile, outputFileName);
1438 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1448 catch(exception& e) {
1449 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1453 //**********************************************************************************************************************
1454 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1456 rabund->setLabel(list->getLabel());
1457 for(int i = 0; i < list->getNumBins(); i++) {
1458 if (m->control_pressed) { break; }
1459 vector<string> binNames;
1460 string bin = list->get(i);
1461 m->splitAtComma(bin, binNames);
1463 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1464 rabund->push_back(total);
1468 catch(exception& e) {
1469 m->errorOut(e, "ClusterCommand", "createRabund");
1474 //**********************************************************************************************************************