2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
21 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
22 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
23 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
24 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
25 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
26 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcluster);
27 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
30 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
31 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
32 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
33 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
34 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
35 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
37 vector<string> myArray;
38 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
42 m->errorOut(e, "ClusterSplitCommand", "setParameters");
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){
49 string helpString = "";
50 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
54 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
56 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59 helpString += "The name parameter allows you to enter your name file. \n";
60 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
69 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){
86 string outputFileName = "";
87 map<string, vector<string> >::iterator it;
89 //is this a type this command creates
90 it = outputTypes.find(type);
91 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
93 if (type == "list") { outputFileName = "list"; }
94 else if (type == "rabund") { outputFileName = "rabund"; }
95 else if (type == "sabund") { outputFileName = "sabund"; }
96 else if (type == "column") { outputFileName = "dist"; }
97 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
99 return outputFileName;
101 catch(exception& e) {
102 m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
106 //**********************************************************************************************************************
107 ClusterSplitCommand::ClusterSplitCommand(){
109 abort = true; calledHelp = true;
111 vector<string> tempOutNames;
112 outputTypes["list"] = tempOutNames;
113 outputTypes["rabund"] = tempOutNames;
114 outputTypes["sabund"] = tempOutNames;
115 outputTypes["column"] = tempOutNames;
117 catch(exception& e) {
118 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
122 //**********************************************************************************************************************
123 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
124 ClusterSplitCommand::ClusterSplitCommand(string option) {
126 abort = false; calledHelp = false;
129 //allow user to run help
130 if(option == "help") { help(); abort = true; calledHelp = true; }
131 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
134 vector<string> myArray = setParameters();
136 OptionParser parser(option);
137 map<string,string> parameters = parser.getParameters();
139 ValidParameters validParameter("cluster.split");
141 //check to make sure all parameters are valid for command
142 map<string,string>::iterator it;
143 for (it = parameters.begin(); it != parameters.end(); it++) {
144 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
149 //initialize outputTypes
150 vector<string> tempOutNames;
151 outputTypes["list"] = tempOutNames;
152 outputTypes["rabund"] = tempOutNames;
153 outputTypes["sabund"] = tempOutNames;
154 outputTypes["column"] = tempOutNames;
156 //if the user changes the output directory command factory will send this info to us in the output parameter
157 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
159 //if the user changes the input directory command factory will send this info to us in the output parameter
160 string inputDir = validParameter.validFile(parameters, "inputdir", false);
161 if (inputDir == "not found"){ inputDir = ""; }
164 it = parameters.find("phylip");
165 //user has given a template file
166 if(it != parameters.end()){
167 path = m->hasPath(it->second);
168 //if the user has not given a path then, add inputdir. else leave path alone.
169 if (path == "") { parameters["phylip"] = inputDir + it->second; }
172 it = parameters.find("column");
173 //user has given a template file
174 if(it != parameters.end()){
175 path = m->hasPath(it->second);
176 //if the user has not given a path then, add inputdir. else leave path alone.
177 if (path == "") { parameters["column"] = inputDir + it->second; }
180 it = parameters.find("name");
181 //user has given a template file
182 if(it != parameters.end()){
183 path = m->hasPath(it->second);
184 //if the user has not given a path then, add inputdir. else leave path alone.
185 if (path == "") { parameters["name"] = inputDir + it->second; }
188 it = parameters.find("taxonomy");
189 //user has given a template file
190 if(it != parameters.end()){
191 path = m->hasPath(it->second);
192 //if the user has not given a path then, add inputdir. else leave path alone.
193 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
196 it = parameters.find("fasta");
197 //user has given a template file
198 if(it != parameters.end()){
199 path = m->hasPath(it->second);
200 //if the user has not given a path then, add inputdir. else leave path alone.
201 if (path == "") { parameters["fasta"] = inputDir + it->second; }
204 it = parameters.find("count");
205 //user has given a template file
206 if(it != parameters.end()){
207 path = m->hasPath(it->second);
208 //if the user has not given a path then, add inputdir. else leave path alone.
209 if (path == "") { parameters["count"] = inputDir + it->second; }
213 //check for required parameters
214 phylipfile = validParameter.validFile(parameters, "phylip", true);
215 if (phylipfile == "not open") { abort = true; }
216 else if (phylipfile == "not found") { phylipfile = ""; }
217 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
219 columnfile = validParameter.validFile(parameters, "column", true);
220 if (columnfile == "not open") { abort = true; }
221 else if (columnfile == "not found") { columnfile = ""; }
222 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
224 namefile = validParameter.validFile(parameters, "name", true);
225 if (namefile == "not open") { abort = true; namefile = "";}
226 else if (namefile == "not found") { namefile = ""; }
227 else { m->setNameFile(namefile); }
229 countfile = validParameter.validFile(parameters, "count", true);
230 if (countfile == "not open") { abort = true; countfile = "";}
231 else if (countfile == "not found") { countfile = ""; }
232 else { m->setCountTableFile(countfile); }
234 fastafile = validParameter.validFile(parameters, "fasta", true);
235 if (fastafile == "not open") { abort = true; }
236 else if (fastafile == "not found") { fastafile = ""; }
237 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
239 taxFile = validParameter.validFile(parameters, "taxonomy", true);
240 if (taxFile == "not open") { taxFile = ""; abort = true; }
241 else if (taxFile == "not found") { taxFile = ""; }
242 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
244 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
245 //is there are current file available for either of these?
246 //give priority to column, then phylip, then fasta
247 columnfile = m->getColumnFile();
248 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
250 phylipfile = m->getPhylipFile();
251 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
253 fastafile = m->getFastaFile();
254 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
256 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
262 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
264 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
266 if (columnfile != "") {
267 if ((namefile == "") && (countfile == "")) {
268 namefile = m->getNameFile();
269 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
271 countfile = m->getCountTableFile();
272 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
274 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
281 if (fastafile != "") {
283 taxFile = m->getTaxonomyFile();
284 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
286 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
291 if ((namefile == "") && (countfile == "")) {
292 namefile = m->getNameFile();
293 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
295 countfile = m->getCountTableFile();
296 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
298 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
305 //check for optional parameter and set defaults
306 // ...at some point should added some additional type checking...
307 //get user cutoff and precision or use defaults
309 temp = validParameter.validFile(parameters, "precision", false);
310 if (temp == "not found") { temp = "100"; }
311 //saves precision legnth for formatting below
312 length = temp.length();
313 m->mothurConvert(temp, precision);
315 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
316 hard = m->isTrue(temp);
318 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
319 large = m->isTrue(temp);
321 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
322 m->setProcessors(temp);
323 m->mothurConvert(temp, processors);
325 temp = validParameter.validFile(parameters, "splitmethod", false);
326 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
327 if (temp == "not found") { splitmethod = "distance"; }
328 else { splitmethod = temp; }
331 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
332 classic = m->isTrue(temp);
334 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
336 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
337 m->mothurConvert(temp, cutoff);
338 cutoff += (5 / (precision * 10.0));
340 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
341 m->mothurConvert(temp, taxLevelCutoff);
343 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
345 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
346 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
348 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
349 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
351 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
353 showabund = validParameter.validFile(parameters, "showabund", false);
354 if (showabund == "not found") { showabund = "T"; }
356 temp = validParameter.validFile(parameters, "cluster", false); if (temp == "not found") { temp = "T"; }
357 runCluster = m->isTrue(temp);
359 timing = validParameter.validFile(parameters, "timing", false);
360 if (timing == "not found") { timing = "F"; }
364 catch(exception& e) {
365 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
370 //**********************************************************************************************************************
372 int ClusterSplitCommand::execute(){
375 if (abort == true) { if (calledHelp) { return 0; } return 2; }
378 vector<string> listFileNames;
380 string singletonName = "";
381 double saveCutoff = cutoff;
383 //****************** file prep work ******************************//
388 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
389 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
391 if (pid == 0) { //only process 0 converts and splits
395 //if user gave a phylip file convert to column file
396 if (format == "phylip") {
398 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
400 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
402 NameAssignment* nameMap = NULL;
403 convert->setFormat("phylip");
404 convert->read(nameMap);
406 if (m->control_pressed) { delete convert; return 0; }
408 distfile = convert->getOutputFile();
410 //if no names file given with phylip file, create it
411 ListVector* listToMakeNameFile = convert->getListVector();
412 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
414 namefile = phylipfile + ".names";
415 m->openOutputFile(namefile, out);
416 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
417 string bin = listToMakeNameFile->get(i);
418 out << bin << '\t' << bin << endl;
422 delete listToMakeNameFile;
425 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
427 if (m->control_pressed) { return 0; }
430 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
432 //split matrix into non-overlapping groups
434 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
435 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
436 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
437 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
441 if (m->control_pressed) { delete split; return 0; }
443 singletonName = split->getSingletonNames();
444 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
447 //output a merged distance file
448 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
451 if (m->control_pressed) { return 0; }
453 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
460 m->mothurOutEndLine();
461 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
462 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
463 m->mothurOutEndLine();
468 //****************** break up files between processes and cluster each file set ******************************//
470 ////you are process 0 from above////
472 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
473 dividedNames.resize(processors);
475 //for each file group figure out which process will complete it
476 //want to divide the load intelligently so the big files are spread between processes
477 for (int i = 0; i < distName.size(); i++) {
478 int processToAssign = (i+1) % processors;
479 if (processToAssign == 0) { processToAssign = processors; }
481 dividedNames[(processToAssign-1)].push_back(distName[i]);
484 //not lets reverse the order of ever other process, so we balance big files running with little ones
485 for (int i = 0; i < processors; i++) {
486 int remainder = ((i+1) % processors);
487 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
491 //send each child the list of files it needs to process
492 for(int i = 1; i < processors; i++) {
493 //send number of file pairs
494 int num = dividedNames[i].size();
495 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
497 for (int j = 0; j < num; j++) { //send filenames to process i
498 char tempDistFileName[1024];
499 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
500 int lengthDist = (dividedNames[i][j].begin()->first).length();
502 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
503 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
505 char tempNameFileName[1024];
506 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
507 int lengthName = (dividedNames[i][j].begin()->second).length();
509 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
510 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
515 listFileNames = cluster(dividedNames[0], labels);
517 //receive the other processes info
518 for(int i = 1; i < processors; i++) {
519 int num = dividedNames[i].size();
522 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
523 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
525 //send list filenames to root process
526 for (int j = 0; j < num; j++) {
528 char tempListFileName[1024];
530 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
531 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
533 string myListFileName = tempListFileName;
534 myListFileName = myListFileName.substr(0, lengthList);
536 listFileNames.push_back(myListFileName);
539 //send Labels to root process
541 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
543 for (int j = 0; j < numLabels; j++) {
547 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
548 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
550 string myLabel = tempLabel;
551 myLabel = myLabel.substr(0, lengthLabel);
553 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
557 }else { //you are a child process
558 vector < map<string, string> > myNames;
560 //recieve the files you need to process
561 //receive number of file pairs
563 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
567 for (int j = 0; j < num; j++) { //receive filenames to process
569 char tempDistFileName[1024];
571 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
572 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
574 string myDistFileName = tempDistFileName;
575 myDistFileName = myDistFileName.substr(0, lengthDist);
578 char tempNameFileName[1024];
580 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
581 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
583 string myNameFileName = tempNameFileName;
584 myNameFileName = myNameFileName.substr(0, lengthName);
587 myNames[j][myDistFileName] = myNameFileName;
591 listFileNames = cluster(myNames, labels);
594 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
596 //send list filenames to root process
597 for (int j = 0; j < num; j++) {
598 char tempListFileName[1024];
599 strcpy(tempListFileName, listFileNames[j].c_str());
600 int lengthList = listFileNames[j].length();
602 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
603 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
606 //send Labels to root process
607 int numLabels = labels.size();
608 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
610 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
612 strcpy(tempLabel, (*it).c_str());
613 int lengthLabel = (*it).length();
615 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
616 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
621 MPI_Barrier(MPI_COMM_WORLD);
624 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
626 if (processors > distName.size()) { processors = distName.size(); }
628 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
630 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
632 listFileNames = createProcesses(distName, labels);
635 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
638 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
640 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
642 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
644 //****************** merge list file and create rabund and sabund files ******************************//
646 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
649 if (pid == 0) { //only process 0 merges
652 ListVector* listSingle;
653 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
655 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
657 mergeLists(listFileNames, labelBins, listSingle);
659 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
661 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
663 //set list file as new current listfile
665 itTypes = outputTypes.find("list");
666 if (itTypes != outputTypes.end()) {
667 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
670 //set rabund file as new current rabundfile
671 itTypes = outputTypes.find("rabund");
672 if (itTypes != outputTypes.end()) {
673 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
676 //set sabund file as new current sabundfile
677 itTypes = outputTypes.find("sabund");
678 if (itTypes != outputTypes.end()) {
679 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
682 m->mothurOutEndLine();
683 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
684 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
685 m->mothurOutEndLine();
688 } //only process 0 merges
691 MPI_Barrier(MPI_COMM_WORLD);
696 catch(exception& e) {
697 m->errorOut(e, "ClusterSplitCommand", "execute");
701 //**********************************************************************************************************************
702 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
705 map<float, int> labelBin;
706 vector<float> orderFloat;
710 if (singleton != "none") {
713 m->openInputFile(singleton, in);
715 string firstCol, secondCol;
716 listSingle = new ListVector();
718 if (countfile != "") { m->getline(in); m->gobble(in); }
721 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
722 if (countfile == "") { listSingle->push_back(secondCol); }
723 else { listSingle->push_back(firstCol); }
727 m->mothurRemove(singleton);
729 numSingleBins = listSingle->getNumBins();
730 }else{ listSingle = NULL; numSingleBins = 0; }
732 //go through users set and make them floats so we can sort them
733 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
736 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
737 else if (*it == "unique") { temp = -1.0; }
739 if (temp <= cutoff) {
740 orderFloat.push_back(temp);
741 labelBin[temp] = numSingleBins; //initialize numbins
746 sort(orderFloat.begin(), orderFloat.end());
749 //get the list info from each file
750 for (int k = 0; k < listNames.size(); k++) {
752 if (m->control_pressed) {
753 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
754 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
758 InputData* input = new InputData(listNames[k], "list");
759 ListVector* list = input->getListVector();
760 string lastLabel = list->getLabel();
762 string filledInList = listNames[k] + "filledInTemp";
764 m->openOutputFile(filledInList, outFilled);
766 //for each label needed
767 for(int l = 0; l < orderFloat.size(); l++){
770 if (orderFloat[l] == -1) { thisLabel = "unique"; }
771 else { thisLabel = toString(orderFloat[l], length-1); }
773 //this file has reached the end
775 list = input->getListVector(lastLabel, true);
776 }else{ //do you have the distance, or do you need to fill in
779 if (list->getLabel() == "unique") { labelFloat = -1.0; }
780 else { convert(list->getLabel(), labelFloat); }
782 //check for missing labels
783 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
784 //if its bigger get last label, otherwise keep it
786 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
788 lastLabel = list->getLabel();
792 list->setLabel(thisLabel);
793 list->print(outFilled);
796 labelBin[orderFloat[l]] += list->getNumBins();
800 list = input->getListVector();
803 if (list != NULL) { delete list; }
807 m->mothurRemove(listNames[k]);
808 rename(filledInList.c_str(), listNames[k].c_str());
813 catch(exception& e) {
814 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
818 //**********************************************************************************************************************
819 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
821 if (outputDir == "") { outputDir += m->hasPath(distfile); }
822 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
824 string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
825 string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
826 string listFileName = fileroot+ tag + ".";
827 if (countfile != "") { listFileName += "unique_"; }
828 listFileName += getOutputFileNameTag("list");
830 if (countfile == "") {
831 m->openOutputFile(sabundFileName, outSabund);
832 m->openOutputFile(rabundFileName, outRabund);
833 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
834 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
837 m->openOutputFile(listFileName, outList);
838 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
841 map<float, int>::iterator itLabel;
843 //for each label needed
844 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
847 if (itLabel->first == -1) { thisLabel = "unique"; }
848 else { thisLabel = toString(itLabel->first, length-1); }
850 outList << thisLabel << '\t' << itLabel->second << '\t';
852 RAbundVector* rabund = NULL;
853 if (countfile == "") {
854 rabund = new RAbundVector();
855 rabund->setLabel(thisLabel);
859 if (listSingle != NULL) {
860 for (int j = 0; j < listSingle->getNumBins(); j++) {
861 outList << listSingle->get(j) << '\t';
862 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
866 //get the list info from each file
867 for (int k = 0; k < listNames.size(); k++) {
869 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
871 InputData* input = new InputData(listNames[k], "list");
872 ListVector* list = input->getListVector(thisLabel);
874 //this file has reached the end
875 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
877 for (int j = 0; j < list->getNumBins(); j++) {
878 outList << list->get(j) << '\t';
879 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
886 if (countfile == "") {
887 SAbundVector sabund = rabund->getSAbundVector();
888 sabund.print(outSabund);
889 rabund->print(outRabund);
893 if (rabund != NULL) { delete rabund; }
897 if (countfile == "") {
901 if (listSingle != NULL) { delete listSingle; }
903 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
907 catch(exception& e) {
908 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
913 //**********************************************************************************************************************
915 void ClusterSplitCommand::printData(ListVector* oldList){
917 string label = oldList->getLabel();
918 RAbundVector oldRAbund = oldList->getRAbundVector();
920 oldRAbund.setLabel(label);
921 if (m->isTrue(showabund)) {
922 oldRAbund.getSAbundVector().print(cout);
924 oldRAbund.print(outRabund);
925 oldRAbund.getSAbundVector().print(outSabund);
927 oldList->print(outList);
929 catch(exception& e) {
930 m->errorOut(e, "ClusterSplitCommand", "printData");
934 //**********************************************************************************************************************
935 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
938 vector<string> listFiles;
939 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
940 dividedNames.resize(processors);
942 //for each file group figure out which process will complete it
943 //want to divide the load intelligently so the big files are spread between processes
944 for (int i = 0; i < distName.size(); i++) {
946 int processToAssign = (i+1) % processors;
947 if (processToAssign == 0) { processToAssign = processors; }
949 dividedNames[(processToAssign-1)].push_back(distName[i]);
950 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
953 //not lets reverse the order of ever other process, so we balance big files running with little ones
954 for (int i = 0; i < processors; i++) {
956 int remainder = ((i+1) % processors);
957 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
960 if (m->control_pressed) { return listFiles; }
962 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
966 //loop through and create all the processes you want
967 while (process != processors) {
971 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
975 vector<string> listFileNames = cluster(dividedNames[process], labels);
977 //write out names to file
978 string filename = toString(getpid()) + ".temp";
980 m->openOutputFile(filename, out);
982 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
987 filename = toString(getpid()) + ".temp.labels";
988 m->openOutputFile(filename, outLabels);
990 outLabels << cutoff << endl;
991 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
992 outLabels << (*it) << endl;
998 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
999 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1005 listFiles = cluster(dividedNames[0], labels);
1007 //force parent to wait until all the processes are done
1008 for (int i=0;i< processIDS.size();i++) {
1009 int temp = processIDS[i];
1013 //get list of list file names from each process
1014 for(int i=0;i<processIDS.size();i++){
1015 string filename = toString(processIDS[i]) + ".temp";
1017 m->openInputFile(filename, in);
1019 in >> tag; m->gobble(in);
1023 in >> tempName; m->gobble(in);
1024 listFiles.push_back(tempName);
1027 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1030 filename = toString(processIDS[i]) + ".temp.labels";
1032 m->openInputFile(filename, in2);
1035 in2 >> tempCutoff; m->gobble(in2);
1036 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1040 in2 >> tempName; m->gobble(in2);
1041 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1044 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1050 //////////////////////////////////////////////////////////////////////////////////////////////////////
1051 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1052 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1053 //Taking advantage of shared memory to allow both threads to add labels.
1054 //////////////////////////////////////////////////////////////////////////////////////////////////////
1056 vector<clusterData*> pDataArray;
1057 DWORD dwThreadIdArray[processors-1];
1058 HANDLE hThreadArray[processors-1];
1060 //Create processor worker threads.
1061 for( int i=1; i<processors; i++ ){
1062 // Allocate memory for thread data.
1063 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1064 pDataArray.push_back(tempCluster);
1065 processIDS.push_back(i);
1067 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1068 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1069 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1074 listFiles = cluster(dividedNames[0], labels);
1076 //Wait until all threads have terminated.
1077 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1079 //Close all thread handles and free memory allocations.
1080 for(int i=0; i < pDataArray.size(); i++){
1082 tag = pDataArray[i]->tag;
1083 //get listfiles created
1084 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1086 set<string>::iterator it;
1087 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1089 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1090 CloseHandle(hThreadArray[i]);
1091 delete pDataArray[i];
1099 catch(exception& e) {
1100 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1104 //**********************************************************************************************************************
1106 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1109 vector<string> listFileNames;
1110 double smallestCutoff = cutoff;
1112 //cluster each distance file
1113 for (int i = 0; i < distNames.size(); i++) {
1115 string thisNamefile = distNames[i].begin()->second;
1116 string thisDistFile = distNames[i].begin()->first;
1118 string listFileName = "";
1119 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1120 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1122 if (m->control_pressed) { //clean up
1123 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1124 listFileNames.clear(); return listFileNames;
1127 listFileNames.push_back(listFileName);
1130 cutoff = smallestCutoff;
1132 return listFileNames;
1135 catch(exception& e) {
1136 m->errorOut(e, "ClusterSplitCommand", "cluster");
1142 //**********************************************************************************************************************
1143 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1145 string listFileName = "";
1147 ListVector* list = NULL;
1149 RAbundVector* rabund = NULL;
1153 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1155 //output your files too
1157 cout << endl << "Reading " << thisDistFile << endl;
1161 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1163 //reads phylip file storing data in 2D vector, also fills list and rabund
1165 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1167 NameAssignment* nameMap = NULL;
1168 CountTable* ct = NULL;
1170 nameMap = new NameAssignment(thisNamefile);
1172 cluster->readPhylipFile(thisDistFile, nameMap);
1173 }else if (countfile != "") {
1174 ct = new CountTable();
1175 ct->readTable(thisNamefile);
1176 cluster->readPhylipFile(thisDistFile, ct);
1178 tag = cluster->getTag();
1180 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1181 else { delete ct; } delete cluster; return 0; }
1183 list = cluster->getListVector();
1184 rabund = cluster->getRAbundVector();
1186 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1187 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1188 listFileName = fileroot+ tag + ".list";
1191 m->openOutputFile(fileroot+ tag + ".list", listFile);
1193 float previousDist = 0.00000;
1194 float rndPreviousDist = 0.00000;
1198 //output your files too
1200 cout << endl << "Clustering " << thisDistFile << endl;
1204 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1206 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1207 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1208 else { delete ct; } return listFileName; }
1210 cluster->update(cutoff);
1212 float dist = cluster->getSmallDist();
1215 rndDist = m->ceilDist(dist, precision);
1217 rndDist = m->roundDist(dist, precision);
1220 if(previousDist <= 0.0000 && dist != previousDist){
1221 oldList.setLabel("unique");
1222 oldList.print(listFile);
1223 if (labels.count("unique") == 0) { labels.insert("unique"); }
1225 else if(rndDist != rndPreviousDist){
1226 oldList.setLabel(toString(rndPreviousDist, length-1));
1227 oldList.print(listFile);
1228 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1232 previousDist = dist;
1233 rndPreviousDist = rndDist;
1237 if(previousDist <= 0.0000){
1238 oldList.setLabel("unique");
1239 oldList.print(listFile);
1240 if (labels.count("unique") == 0) { labels.insert("unique"); }
1242 else if(rndPreviousDist<cutoff){
1243 oldList.setLabel(toString(rndPreviousDist, length-1));
1244 oldList.print(listFile);
1245 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1251 delete cluster; delete list; delete rabund;
1252 if(namefile != ""){ delete nameMap; }
1255 m->mothurRemove(thisDistFile);
1256 m->mothurRemove(thisNamefile);
1258 return listFileName;
1261 catch(exception& e) {
1262 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1267 //**********************************************************************************************************************
1268 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1270 string listFileName = "";
1272 Cluster* cluster = NULL;
1273 SparseDistanceMatrix* matrix = NULL;
1274 ListVector* list = NULL;
1276 RAbundVector* rabund = NULL;
1278 if (m->control_pressed) { return listFileName; }
1282 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1284 //output your files too
1286 cout << endl << "Reading " << thisDistFile << endl;
1290 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1292 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1293 read->setCutoff(cutoff);
1295 NameAssignment* nameMap = NULL;
1296 CountTable* ct = NULL;
1298 nameMap = new NameAssignment(thisNamefile);
1300 read->read(nameMap);
1301 }else if (countfile != "") {
1302 ct = new CountTable();
1303 ct->readTable(thisNamefile);
1305 }else { read->read(nameMap); }
1307 list = read->getListVector();
1309 matrix = read->getDMatrix();
1311 if(countfile != "") {
1312 rabund = new RAbundVector();
1313 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1315 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1317 delete read; read = NULL;
1318 if (namefile != "") { delete nameMap; nameMap = NULL; }
1322 //output your files too
1324 cout << endl << "Clustering " << thisDistFile << endl;
1328 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1331 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1332 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1333 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1334 tag = cluster->getTag();
1336 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1337 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1340 m->openOutputFile(fileroot+ tag + ".list", listFile);
1342 listFileName = fileroot+ tag + ".list";
1344 float previousDist = 0.00000;
1345 float rndPreviousDist = 0.00000;
1351 double saveCutoff = cutoff;
1353 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1355 if (m->control_pressed) { //clean up
1356 delete matrix; delete list; delete cluster; delete rabund;
1358 m->mothurRemove(listFileName);
1359 return listFileName;
1362 cluster->update(saveCutoff);
1364 float dist = matrix->getSmallDist();
1367 rndDist = m->ceilDist(dist, precision);
1369 rndDist = m->roundDist(dist, precision);
1372 if(previousDist <= 0.0000 && dist != previousDist){
1373 oldList.setLabel("unique");
1374 oldList.print(listFile);
1375 if (labels.count("unique") == 0) { labels.insert("unique"); }
1377 else if(rndDist != rndPreviousDist){
1378 oldList.setLabel(toString(rndPreviousDist, length-1));
1379 oldList.print(listFile);
1380 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1383 previousDist = dist;
1384 rndPreviousDist = rndDist;
1389 if(previousDist <= 0.0000){
1390 oldList.setLabel("unique");
1391 oldList.print(listFile);
1392 if (labels.count("unique") == 0) { labels.insert("unique"); }
1394 else if(rndPreviousDist<cutoff){
1395 oldList.setLabel(toString(rndPreviousDist, length-1));
1396 oldList.print(listFile);
1397 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1400 delete matrix; delete list; delete cluster; delete rabund;
1401 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1404 if (m->control_pressed) { //clean up
1405 m->mothurRemove(listFileName);
1406 return listFileName;
1409 m->mothurRemove(thisDistFile);
1410 m->mothurRemove(thisNamefile);
1412 if (saveCutoff != cutoff) {
1413 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1414 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1416 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1419 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1421 return listFileName;
1424 catch(exception& e) {
1425 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1429 //**********************************************************************************************************************
1431 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1436 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1441 string thisOutputDir = outputDir;
1442 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1443 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
1444 m->mothurRemove(outputFileName);
1447 for (int i = 0; i < distNames.size(); i++) {
1448 if (m->control_pressed) { return 0; }
1450 string thisDistFile = distNames[i].begin()->first;
1452 m->appendFiles(thisDistFile, outputFileName);
1455 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1465 catch(exception& e) {
1466 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1470 //**********************************************************************************************************************
1471 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1473 rabund->setLabel(list->getLabel());
1474 for(int i = 0; i < list->getNumBins(); i++) {
1475 if (m->control_pressed) { break; }
1476 vector<string> binNames;
1477 string bin = list->get(i);
1478 m->splitAtComma(bin, binNames);
1480 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1481 rabund->push_back(total);
1485 catch(exception& e) {
1486 m->errorOut(e, "ClusterCommand", "createRabund");
1491 //**********************************************************************************************************************