2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37 vector<string> myArray;
38 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
42 m->errorOut(e, "ClusterSplitCommand", "setParameters");
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){
49 string helpString = "";
50 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
54 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
56 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59 helpString += "The name parameter allows you to enter your name file. \n";
60 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
69 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
88 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
89 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
90 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
91 else if (type == "column") { pattern = "[filename],dist"; }
92 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
97 m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){
104 abort = true; calledHelp = true;
106 vector<string> tempOutNames;
107 outputTypes["list"] = tempOutNames;
108 outputTypes["rabund"] = tempOutNames;
109 outputTypes["sabund"] = tempOutNames;
110 outputTypes["column"] = tempOutNames;
112 catch(exception& e) {
113 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option) {
121 abort = false; calledHelp = false;
124 //allow user to run help
125 if(option == "help") { help(); abort = true; calledHelp = true; }
126 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
129 vector<string> myArray = setParameters();
131 OptionParser parser(option);
132 map<string,string> parameters = parser.getParameters();
134 ValidParameters validParameter("cluster.split");
136 //check to make sure all parameters are valid for command
137 map<string,string>::iterator it;
138 for (it = parameters.begin(); it != parameters.end(); it++) {
139 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
144 //initialize outputTypes
145 vector<string> tempOutNames;
146 outputTypes["list"] = tempOutNames;
147 outputTypes["rabund"] = tempOutNames;
148 outputTypes["sabund"] = tempOutNames;
149 outputTypes["column"] = tempOutNames;
151 //if the user changes the output directory command factory will send this info to us in the output parameter
152 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
154 //if the user changes the input directory command factory will send this info to us in the output parameter
155 string inputDir = validParameter.validFile(parameters, "inputdir", false);
156 if (inputDir == "not found"){ inputDir = ""; }
159 it = parameters.find("phylip");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["phylip"] = inputDir + it->second; }
167 it = parameters.find("column");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["column"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
183 it = parameters.find("taxonomy");
184 //user has given a template file
185 if(it != parameters.end()){
186 path = m->hasPath(it->second);
187 //if the user has not given a path then, add inputdir. else leave path alone.
188 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
191 it = parameters.find("fasta");
192 //user has given a template file
193 if(it != parameters.end()){
194 path = m->hasPath(it->second);
195 //if the user has not given a path then, add inputdir. else leave path alone.
196 if (path == "") { parameters["fasta"] = inputDir + it->second; }
199 it = parameters.find("count");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["count"] = inputDir + it->second; }
208 //check for required parameters
209 phylipfile = validParameter.validFile(parameters, "phylip", true);
210 if (phylipfile == "not open") { abort = true; }
211 else if (phylipfile == "not found") { phylipfile = ""; }
212 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
214 columnfile = validParameter.validFile(parameters, "column", true);
215 if (columnfile == "not open") { abort = true; }
216 else if (columnfile == "not found") { columnfile = ""; }
217 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
219 namefile = validParameter.validFile(parameters, "name", true);
220 if (namefile == "not open") { abort = true; namefile = "";}
221 else if (namefile == "not found") { namefile = ""; }
222 else { m->setNameFile(namefile); }
224 countfile = validParameter.validFile(parameters, "count", true);
225 if (countfile == "not open") { abort = true; countfile = "";}
226 else if (countfile == "not found") { countfile = ""; }
227 else { m->setCountTableFile(countfile); }
229 fastafile = validParameter.validFile(parameters, "fasta", true);
230 if (fastafile == "not open") { abort = true; }
231 else if (fastafile == "not found") { fastafile = ""; }
232 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
234 taxFile = validParameter.validFile(parameters, "taxonomy", true);
235 if (taxFile == "not open") { taxFile = ""; abort = true; }
236 else if (taxFile == "not found") { taxFile = ""; }
237 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
239 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
240 //is there are current file available for either of these?
241 //give priority to column, then phylip, then fasta
242 columnfile = m->getColumnFile();
243 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
245 phylipfile = m->getPhylipFile();
246 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
248 fastafile = m->getFastaFile();
249 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
257 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
259 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
261 if (columnfile != "") {
262 if ((namefile == "") && (countfile == "")) {
263 namefile = m->getNameFile();
264 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
266 countfile = m->getCountTableFile();
267 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
269 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
276 if (fastafile != "") {
278 taxFile = m->getTaxonomyFile();
279 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
281 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
286 if ((namefile == "") && (countfile == "")) {
287 namefile = m->getNameFile();
288 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
290 countfile = m->getCountTableFile();
291 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
293 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
300 //check for optional parameter and set defaults
301 // ...at some point should added some additional type checking...
302 //get user cutoff and precision or use defaults
304 temp = validParameter.validFile(parameters, "precision", false);
305 if (temp == "not found") { temp = "100"; }
306 //saves precision legnth for formatting below
307 length = temp.length();
308 m->mothurConvert(temp, precision);
310 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
311 hard = m->isTrue(temp);
313 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
314 large = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
317 m->setProcessors(temp);
318 m->mothurConvert(temp, processors);
320 temp = validParameter.validFile(parameters, "splitmethod", false);
321 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322 if (temp == "not found") { splitmethod = "distance"; }
323 else { splitmethod = temp; }
326 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
327 classic = m->isTrue(temp);
329 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
331 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
332 m->mothurConvert(temp, cutoff);
333 cutoff += (5 / (precision * 10.0));
335 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
336 m->mothurConvert(temp, taxLevelCutoff);
338 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
340 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
343 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
346 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
348 showabund = validParameter.validFile(parameters, "showabund", false);
349 if (showabund == "not found") { showabund = "T"; }
351 temp = validParameter.validFile(parameters, "cluster", false); if (temp == "not found") { temp = "T"; }
352 runCluster = m->isTrue(temp);
354 timing = validParameter.validFile(parameters, "timing", false);
355 if (timing == "not found") { timing = "F"; }
359 catch(exception& e) {
360 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
365 //**********************************************************************************************************************
367 int ClusterSplitCommand::execute(){
370 if (abort == true) { if (calledHelp) { return 0; } return 2; }
373 vector<string> listFileNames;
375 string singletonName = "";
376 double saveCutoff = cutoff;
378 //****************** file prep work ******************************//
383 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
386 if (pid == 0) { //only process 0 converts and splits
390 //if user gave a phylip file convert to column file
391 if (format == "phylip") {
393 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
395 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
397 NameAssignment* nameMap = NULL;
398 convert->setFormat("phylip");
399 convert->read(nameMap);
401 if (m->control_pressed) { delete convert; return 0; }
403 distfile = convert->getOutputFile();
405 //if no names file given with phylip file, create it
406 ListVector* listToMakeNameFile = convert->getListVector();
407 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
409 namefile = phylipfile + ".names";
410 m->openOutputFile(namefile, out);
411 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412 string bin = listToMakeNameFile->get(i);
413 out << bin << '\t' << bin << endl;
417 delete listToMakeNameFile;
420 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
422 if (m->control_pressed) { return 0; }
425 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
427 //split matrix into non-overlapping groups
429 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
430 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
431 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
432 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
436 if (m->control_pressed) { delete split; return 0; }
438 singletonName = split->getSingletonNames();
439 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
442 //output a merged distance file
443 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
446 if (m->control_pressed) { return 0; }
448 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
453 m->mothurOutEndLine();
454 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
455 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
456 m->mothurOutEndLine();
461 //****************** break up files between processes and cluster each file set ******************************//
463 ////you are process 0 from above////
465 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
466 dividedNames.resize(processors);
468 //for each file group figure out which process will complete it
469 //want to divide the load intelligently so the big files are spread between processes
470 for (int i = 0; i < distName.size(); i++) {
471 int processToAssign = (i+1) % processors;
472 if (processToAssign == 0) { processToAssign = processors; }
474 dividedNames[(processToAssign-1)].push_back(distName[i]);
477 //not lets reverse the order of ever other process, so we balance big files running with little ones
478 for (int i = 0; i < processors; i++) {
479 int remainder = ((i+1) % processors);
480 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
484 //send each child the list of files it needs to process
485 for(int i = 1; i < processors; i++) {
486 //send number of file pairs
487 int num = dividedNames[i].size();
488 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
490 for (int j = 0; j < num; j++) { //send filenames to process i
491 char tempDistFileName[1024];
492 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
493 int lengthDist = (dividedNames[i][j].begin()->first).length();
495 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
496 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
498 char tempNameFileName[1024];
499 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
500 int lengthName = (dividedNames[i][j].begin()->second).length();
502 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
503 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
508 listFileNames = cluster(dividedNames[0], labels);
510 //receive the other processes info
511 for(int i = 1; i < processors; i++) {
512 int num = dividedNames[i].size();
515 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
516 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
518 //send list filenames to root process
519 for (int j = 0; j < num; j++) {
521 char tempListFileName[1024];
523 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
524 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
526 string myListFileName = tempListFileName;
527 myListFileName = myListFileName.substr(0, lengthList);
529 listFileNames.push_back(myListFileName);
532 //send Labels to root process
534 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
536 for (int j = 0; j < numLabels; j++) {
540 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
541 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
543 string myLabel = tempLabel;
544 myLabel = myLabel.substr(0, lengthLabel);
546 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
550 }else { //you are a child process
551 vector < map<string, string> > myNames;
553 //recieve the files you need to process
554 //receive number of file pairs
556 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
560 for (int j = 0; j < num; j++) { //receive filenames to process
562 char tempDistFileName[1024];
564 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
565 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
567 string myDistFileName = tempDistFileName;
568 myDistFileName = myDistFileName.substr(0, lengthDist);
571 char tempNameFileName[1024];
573 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
574 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
576 string myNameFileName = tempNameFileName;
577 myNameFileName = myNameFileName.substr(0, lengthName);
580 myNames[j][myDistFileName] = myNameFileName;
584 listFileNames = cluster(myNames, labels);
587 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
589 //send list filenames to root process
590 for (int j = 0; j < num; j++) {
591 char tempListFileName[1024];
592 strcpy(tempListFileName, listFileNames[j].c_str());
593 int lengthList = listFileNames[j].length();
595 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
596 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
599 //send Labels to root process
600 int numLabels = labels.size();
601 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
603 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
605 strcpy(tempLabel, (*it).c_str());
606 int lengthLabel = (*it).length();
608 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
609 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
614 MPI_Barrier(MPI_COMM_WORLD);
617 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
619 if (processors > distName.size()) { processors = distName.size(); }
621 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
623 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
625 listFileNames = createProcesses(distName, labels);
628 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
631 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
633 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
635 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
637 //****************** merge list file and create rabund and sabund files ******************************//
639 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
642 if (pid == 0) { //only process 0 merges
645 ListVector* listSingle;
646 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
648 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
650 mergeLists(listFileNames, labelBins, listSingle);
652 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
654 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
656 //set list file as new current listfile
658 itTypes = outputTypes.find("list");
659 if (itTypes != outputTypes.end()) {
660 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
663 //set rabund file as new current rabundfile
664 itTypes = outputTypes.find("rabund");
665 if (itTypes != outputTypes.end()) {
666 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
669 //set sabund file as new current sabundfile
670 itTypes = outputTypes.find("sabund");
671 if (itTypes != outputTypes.end()) {
672 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
675 m->mothurOutEndLine();
676 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
677 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
678 m->mothurOutEndLine();
681 } //only process 0 merges
684 MPI_Barrier(MPI_COMM_WORLD);
689 catch(exception& e) {
690 m->errorOut(e, "ClusterSplitCommand", "execute");
694 //**********************************************************************************************************************
695 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
698 map<float, int> labelBin;
699 vector<float> orderFloat;
703 if (singleton != "none") {
706 m->openInputFile(singleton, in);
708 string firstCol, secondCol;
709 listSingle = new ListVector();
711 if (countfile != "") { m->getline(in); m->gobble(in); }
714 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
715 if (countfile == "") { listSingle->push_back(secondCol); }
716 else { listSingle->push_back(firstCol); }
720 m->mothurRemove(singleton);
722 numSingleBins = listSingle->getNumBins();
723 }else{ listSingle = NULL; numSingleBins = 0; }
725 //go through users set and make them floats so we can sort them
726 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
729 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
730 else if (*it == "unique") { temp = -1.0; }
732 if (temp <= cutoff) {
733 orderFloat.push_back(temp);
734 labelBin[temp] = numSingleBins; //initialize numbins
739 sort(orderFloat.begin(), orderFloat.end());
742 //get the list info from each file
743 for (int k = 0; k < listNames.size(); k++) {
745 if (m->control_pressed) {
746 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
747 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
751 InputData* input = new InputData(listNames[k], "list");
752 ListVector* list = input->getListVector();
753 string lastLabel = list->getLabel();
755 string filledInList = listNames[k] + "filledInTemp";
757 m->openOutputFile(filledInList, outFilled);
759 //for each label needed
760 for(int l = 0; l < orderFloat.size(); l++){
763 if (orderFloat[l] == -1) { thisLabel = "unique"; }
764 else { thisLabel = toString(orderFloat[l], length-1); }
766 //this file has reached the end
768 list = input->getListVector(lastLabel, true);
769 }else{ //do you have the distance, or do you need to fill in
772 if (list->getLabel() == "unique") { labelFloat = -1.0; }
773 else { convert(list->getLabel(), labelFloat); }
775 //check for missing labels
776 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
777 //if its bigger get last label, otherwise keep it
779 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
781 lastLabel = list->getLabel();
785 list->setLabel(thisLabel);
786 list->print(outFilled);
789 labelBin[orderFloat[l]] += list->getNumBins();
793 list = input->getListVector();
796 if (list != NULL) { delete list; }
800 m->mothurRemove(listNames[k]);
801 rename(filledInList.c_str(), listNames[k].c_str());
806 catch(exception& e) {
807 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
811 //**********************************************************************************************************************
812 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
814 if (outputDir == "") { outputDir += m->hasPath(distfile); }
815 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
817 map<string, string> variables;
818 variables["[filename]"] = fileroot;
819 variables["[clustertag]"] = tag;
820 string sabundFileName = getOutputFileName("sabund", variables);
821 string rabundFileName = getOutputFileName("rabund", variables);
822 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
823 string listFileName = getOutputFileName("list", variables);
825 if (countfile == "") {
826 m->openOutputFile(sabundFileName, outSabund);
827 m->openOutputFile(rabundFileName, outRabund);
828 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
829 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
832 m->openOutputFile(listFileName, outList);
833 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
836 map<float, int>::iterator itLabel;
838 //for each label needed
839 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
842 if (itLabel->first == -1) { thisLabel = "unique"; }
843 else { thisLabel = toString(itLabel->first, length-1); }
845 outList << thisLabel << '\t' << itLabel->second << '\t';
847 RAbundVector* rabund = NULL;
848 if (countfile == "") {
849 rabund = new RAbundVector();
850 rabund->setLabel(thisLabel);
854 if (listSingle != NULL) {
855 for (int j = 0; j < listSingle->getNumBins(); j++) {
856 outList << listSingle->get(j) << '\t';
857 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
861 //get the list info from each file
862 for (int k = 0; k < listNames.size(); k++) {
864 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
866 InputData* input = new InputData(listNames[k], "list");
867 ListVector* list = input->getListVector(thisLabel);
869 //this file has reached the end
870 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
872 for (int j = 0; j < list->getNumBins(); j++) {
873 outList << list->get(j) << '\t';
874 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
881 if (countfile == "") {
882 SAbundVector sabund = rabund->getSAbundVector();
883 sabund.print(outSabund);
884 rabund->print(outRabund);
888 if (rabund != NULL) { delete rabund; }
892 if (countfile == "") {
896 if (listSingle != NULL) { delete listSingle; }
898 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
902 catch(exception& e) {
903 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
908 //**********************************************************************************************************************
910 void ClusterSplitCommand::printData(ListVector* oldList){
912 string label = oldList->getLabel();
913 RAbundVector oldRAbund = oldList->getRAbundVector();
915 oldRAbund.setLabel(label);
916 if (m->isTrue(showabund)) {
917 oldRAbund.getSAbundVector().print(cout);
919 oldRAbund.print(outRabund);
920 oldRAbund.getSAbundVector().print(outSabund);
922 oldList->print(outList);
924 catch(exception& e) {
925 m->errorOut(e, "ClusterSplitCommand", "printData");
929 //**********************************************************************************************************************
930 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
933 vector<string> listFiles;
934 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
935 dividedNames.resize(processors);
937 //for each file group figure out which process will complete it
938 //want to divide the load intelligently so the big files are spread between processes
939 for (int i = 0; i < distName.size(); i++) {
941 int processToAssign = (i+1) % processors;
942 if (processToAssign == 0) { processToAssign = processors; }
944 dividedNames[(processToAssign-1)].push_back(distName[i]);
945 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
948 //not lets reverse the order of ever other process, so we balance big files running with little ones
949 for (int i = 0; i < processors; i++) {
951 int remainder = ((i+1) % processors);
952 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
955 if (m->control_pressed) { return listFiles; }
957 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
961 //loop through and create all the processes you want
962 while (process != processors) {
966 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
970 vector<string> listFileNames = cluster(dividedNames[process], labels);
972 //write out names to file
973 string filename = toString(getpid()) + ".temp";
975 m->openOutputFile(filename, out);
977 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
982 filename = toString(getpid()) + ".temp.labels";
983 m->openOutputFile(filename, outLabels);
985 outLabels << cutoff << endl;
986 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
987 outLabels << (*it) << endl;
993 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
994 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1000 listFiles = cluster(dividedNames[0], labels);
1002 //force parent to wait until all the processes are done
1003 for (int i=0;i< processIDS.size();i++) {
1004 int temp = processIDS[i];
1008 //get list of list file names from each process
1009 for(int i=0;i<processIDS.size();i++){
1010 string filename = toString(processIDS[i]) + ".temp";
1012 m->openInputFile(filename, in);
1014 in >> tag; m->gobble(in);
1018 in >> tempName; m->gobble(in);
1019 listFiles.push_back(tempName);
1022 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1025 filename = toString(processIDS[i]) + ".temp.labels";
1027 m->openInputFile(filename, in2);
1030 in2 >> tempCutoff; m->gobble(in2);
1031 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1035 in2 >> tempName; m->gobble(in2);
1036 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1039 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1045 //////////////////////////////////////////////////////////////////////////////////////////////////////
1046 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1047 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1048 //Taking advantage of shared memory to allow both threads to add labels.
1049 //////////////////////////////////////////////////////////////////////////////////////////////////////
1051 vector<clusterData*> pDataArray;
1052 DWORD dwThreadIdArray[processors-1];
1053 HANDLE hThreadArray[processors-1];
1055 //Create processor worker threads.
1056 for( int i=1; i<processors; i++ ){
1057 // Allocate memory for thread data.
1058 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1059 pDataArray.push_back(tempCluster);
1060 processIDS.push_back(i);
1062 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1063 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1064 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1069 listFiles = cluster(dividedNames[0], labels);
1071 //Wait until all threads have terminated.
1072 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1074 //Close all thread handles and free memory allocations.
1075 for(int i=0; i < pDataArray.size(); i++){
1077 tag = pDataArray[i]->tag;
1078 //get listfiles created
1079 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1081 set<string>::iterator it;
1082 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1084 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1085 CloseHandle(hThreadArray[i]);
1086 delete pDataArray[i];
1094 catch(exception& e) {
1095 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1099 //**********************************************************************************************************************
1101 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1104 vector<string> listFileNames;
1105 double smallestCutoff = cutoff;
1107 //cluster each distance file
1108 for (int i = 0; i < distNames.size(); i++) {
1110 string thisNamefile = distNames[i].begin()->second;
1111 string thisDistFile = distNames[i].begin()->first;
1113 string listFileName = "";
1114 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1115 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1117 if (m->control_pressed) { //clean up
1118 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1119 listFileNames.clear(); return listFileNames;
1122 listFileNames.push_back(listFileName);
1125 cutoff = smallestCutoff;
1127 return listFileNames;
1130 catch(exception& e) {
1131 m->errorOut(e, "ClusterSplitCommand", "cluster");
1137 //**********************************************************************************************************************
1138 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1140 string listFileName = "";
1142 ListVector* list = NULL;
1144 RAbundVector* rabund = NULL;
1148 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1150 //output your files too
1152 cout << endl << "Reading " << thisDistFile << endl;
1156 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1158 //reads phylip file storing data in 2D vector, also fills list and rabund
1160 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1162 NameAssignment* nameMap = NULL;
1163 CountTable* ct = NULL;
1165 nameMap = new NameAssignment(thisNamefile);
1167 cluster->readPhylipFile(thisDistFile, nameMap);
1168 }else if (countfile != "") {
1169 ct = new CountTable();
1170 ct->readTable(thisNamefile);
1171 cluster->readPhylipFile(thisDistFile, ct);
1173 tag = cluster->getTag();
1175 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1176 else { delete ct; } delete cluster; return 0; }
1178 list = cluster->getListVector();
1179 rabund = cluster->getRAbundVector();
1181 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1182 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1183 listFileName = fileroot+ tag + ".list";
1186 m->openOutputFile(fileroot+ tag + ".list", listFile);
1188 float previousDist = 0.00000;
1189 float rndPreviousDist = 0.00000;
1193 //output your files too
1195 cout << endl << "Clustering " << thisDistFile << endl;
1199 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1201 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1202 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1203 else { delete ct; } return listFileName; }
1205 cluster->update(cutoff);
1207 float dist = cluster->getSmallDist();
1210 rndDist = m->ceilDist(dist, precision);
1212 rndDist = m->roundDist(dist, precision);
1215 if(previousDist <= 0.0000 && dist != previousDist){
1216 oldList.setLabel("unique");
1217 oldList.print(listFile);
1218 if (labels.count("unique") == 0) { labels.insert("unique"); }
1220 else if(rndDist != rndPreviousDist){
1221 oldList.setLabel(toString(rndPreviousDist, length-1));
1222 oldList.print(listFile);
1223 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1227 previousDist = dist;
1228 rndPreviousDist = rndDist;
1232 if(previousDist <= 0.0000){
1233 oldList.setLabel("unique");
1234 oldList.print(listFile);
1235 if (labels.count("unique") == 0) { labels.insert("unique"); }
1237 else if(rndPreviousDist<cutoff){
1238 oldList.setLabel(toString(rndPreviousDist, length-1));
1239 oldList.print(listFile);
1240 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1246 delete cluster; delete list; delete rabund;
1247 if(namefile != ""){ delete nameMap; }
1250 m->mothurRemove(thisDistFile);
1251 m->mothurRemove(thisNamefile);
1253 return listFileName;
1256 catch(exception& e) {
1257 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1262 //**********************************************************************************************************************
1263 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1265 string listFileName = "";
1267 Cluster* cluster = NULL;
1268 SparseDistanceMatrix* matrix = NULL;
1269 ListVector* list = NULL;
1271 RAbundVector* rabund = NULL;
1273 if (m->control_pressed) { return listFileName; }
1277 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1279 //output your files too
1281 cout << endl << "Reading " << thisDistFile << endl;
1285 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1287 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1288 read->setCutoff(cutoff);
1290 NameAssignment* nameMap = NULL;
1291 CountTable* ct = NULL;
1293 nameMap = new NameAssignment(thisNamefile);
1295 read->read(nameMap);
1296 }else if (countfile != "") {
1297 ct = new CountTable();
1298 ct->readTable(thisNamefile);
1300 }else { read->read(nameMap); }
1302 list = read->getListVector();
1304 matrix = read->getDMatrix();
1306 if(countfile != "") {
1307 rabund = new RAbundVector();
1308 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1310 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1312 delete read; read = NULL;
1313 if (namefile != "") { delete nameMap; nameMap = NULL; }
1317 //output your files too
1319 cout << endl << "Clustering " << thisDistFile << endl;
1323 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1326 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1327 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1328 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1329 tag = cluster->getTag();
1331 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1332 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1335 m->openOutputFile(fileroot+ tag + ".list", listFile);
1337 listFileName = fileroot+ tag + ".list";
1339 float previousDist = 0.00000;
1340 float rndPreviousDist = 0.00000;
1346 double saveCutoff = cutoff;
1348 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1350 if (m->control_pressed) { //clean up
1351 delete matrix; delete list; delete cluster; delete rabund;
1353 m->mothurRemove(listFileName);
1354 return listFileName;
1357 cluster->update(saveCutoff);
1359 float dist = matrix->getSmallDist();
1362 rndDist = m->ceilDist(dist, precision);
1364 rndDist = m->roundDist(dist, precision);
1367 if(previousDist <= 0.0000 && dist != previousDist){
1368 oldList.setLabel("unique");
1369 oldList.print(listFile);
1370 if (labels.count("unique") == 0) { labels.insert("unique"); }
1372 else if(rndDist != rndPreviousDist){
1373 oldList.setLabel(toString(rndPreviousDist, length-1));
1374 oldList.print(listFile);
1375 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1378 previousDist = dist;
1379 rndPreviousDist = rndDist;
1384 if(previousDist <= 0.0000){
1385 oldList.setLabel("unique");
1386 oldList.print(listFile);
1387 if (labels.count("unique") == 0) { labels.insert("unique"); }
1389 else if(rndPreviousDist<cutoff){
1390 oldList.setLabel(toString(rndPreviousDist, length-1));
1391 oldList.print(listFile);
1392 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1395 delete matrix; delete list; delete cluster; delete rabund;
1396 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1399 if (m->control_pressed) { //clean up
1400 m->mothurRemove(listFileName);
1401 return listFileName;
1404 m->mothurRemove(thisDistFile);
1405 m->mothurRemove(thisNamefile);
1407 if (saveCutoff != cutoff) {
1408 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1409 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1411 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1414 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1416 return listFileName;
1419 catch(exception& e) {
1420 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1424 //**********************************************************************************************************************
1426 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1431 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1436 string thisOutputDir = outputDir;
1437 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1438 map<string, string> variables;
1439 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1440 string outputFileName = getOutputFileName("column", variables);
1441 m->mothurRemove(outputFileName);
1444 for (int i = 0; i < distNames.size(); i++) {
1445 if (m->control_pressed) { return 0; }
1447 string thisDistFile = distNames[i].begin()->first;
1449 m->appendFiles(thisDistFile, outputFileName);
1452 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1462 catch(exception& e) {
1463 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1467 //**********************************************************************************************************************
1468 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1470 rabund->setLabel(list->getLabel());
1471 for(int i = 0; i < list->getNumBins(); i++) {
1472 if (m->control_pressed) { break; }
1473 vector<string> binNames;
1474 string bin = list->get(i);
1475 m->splitAtComma(bin, binNames);
1477 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1478 rabund->push_back(total);
1482 catch(exception& e) {
1483 m->errorOut(e, "ClusterCommand", "createRabund");
1488 //**********************************************************************************************************************