2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37 vector<string> myArray;
38 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
42 m->errorOut(e, "ClusterSplitCommand", "setParameters");
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){
49 string helpString = "";
50 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
54 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
56 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59 helpString += "The name parameter allows you to enter your name file. \n";
60 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
69 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
88 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
89 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
90 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
91 else if (type == "column") { pattern = "[filename],dist"; }
92 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
97 m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){
104 abort = true; calledHelp = true;
106 vector<string> tempOutNames;
107 outputTypes["list"] = tempOutNames;
108 outputTypes["rabund"] = tempOutNames;
109 outputTypes["sabund"] = tempOutNames;
110 outputTypes["column"] = tempOutNames;
112 catch(exception& e) {
113 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option) {
121 abort = false; calledHelp = false;
124 //allow user to run help
125 if(option == "help") { help(); abort = true; calledHelp = true; }
126 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
129 vector<string> myArray = setParameters();
131 OptionParser parser(option);
132 map<string,string> parameters = parser.getParameters();
134 ValidParameters validParameter("cluster.split");
136 //check to make sure all parameters are valid for command
137 map<string,string>::iterator it;
138 for (it = parameters.begin(); it != parameters.end(); it++) {
139 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
144 //initialize outputTypes
145 vector<string> tempOutNames;
146 outputTypes["list"] = tempOutNames;
147 outputTypes["rabund"] = tempOutNames;
148 outputTypes["sabund"] = tempOutNames;
149 outputTypes["column"] = tempOutNames;
151 //if the user changes the output directory command factory will send this info to us in the output parameter
152 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
154 //if the user changes the input directory command factory will send this info to us in the output parameter
155 string inputDir = validParameter.validFile(parameters, "inputdir", false);
156 if (inputDir == "not found"){ inputDir = ""; }
159 it = parameters.find("phylip");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["phylip"] = inputDir + it->second; }
167 it = parameters.find("column");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["column"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
183 it = parameters.find("taxonomy");
184 //user has given a template file
185 if(it != parameters.end()){
186 path = m->hasPath(it->second);
187 //if the user has not given a path then, add inputdir. else leave path alone.
188 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
191 it = parameters.find("fasta");
192 //user has given a template file
193 if(it != parameters.end()){
194 path = m->hasPath(it->second);
195 //if the user has not given a path then, add inputdir. else leave path alone.
196 if (path == "") { parameters["fasta"] = inputDir + it->second; }
199 it = parameters.find("count");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["count"] = inputDir + it->second; }
208 //check for required parameters
209 phylipfile = validParameter.validFile(parameters, "phylip", true);
210 if (phylipfile == "not open") { abort = true; }
211 else if (phylipfile == "not found") { phylipfile = ""; }
212 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
214 columnfile = validParameter.validFile(parameters, "column", true);
215 if (columnfile == "not open") { abort = true; }
216 else if (columnfile == "not found") { columnfile = ""; }
217 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
219 namefile = validParameter.validFile(parameters, "name", true);
220 if (namefile == "not open") { abort = true; namefile = "";}
221 else if (namefile == "not found") { namefile = ""; }
222 else { m->setNameFile(namefile); }
224 countfile = validParameter.validFile(parameters, "count", true);
225 if (countfile == "not open") { abort = true; countfile = "";}
226 else if (countfile == "not found") { countfile = ""; }
227 else { m->setCountTableFile(countfile); }
229 fastafile = validParameter.validFile(parameters, "fasta", true);
230 if (fastafile == "not open") { abort = true; }
231 else if (fastafile == "not found") { fastafile = ""; }
232 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
234 taxFile = validParameter.validFile(parameters, "taxonomy", true);
235 if (taxFile == "not open") { taxFile = ""; abort = true; }
236 else if (taxFile == "not found") { taxFile = ""; }
237 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
239 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
240 //is there are current file available for either of these?
241 //give priority to column, then phylip, then fasta
242 columnfile = m->getColumnFile();
243 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
245 phylipfile = m->getPhylipFile();
246 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
248 fastafile = m->getFastaFile();
249 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
257 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
259 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
261 if (columnfile != "") {
262 if ((namefile == "") && (countfile == "")) {
263 namefile = m->getNameFile();
264 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
266 countfile = m->getCountTableFile();
267 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
269 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
276 if (fastafile != "") {
278 taxFile = m->getTaxonomyFile();
279 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
281 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
286 if ((namefile == "") && (countfile == "")) {
287 namefile = m->getNameFile();
288 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
290 countfile = m->getCountTableFile();
291 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
293 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
300 //check for optional parameter and set defaults
301 // ...at some point should added some additional type checking...
302 //get user cutoff and precision or use defaults
304 temp = validParameter.validFile(parameters, "precision", false);
305 if (temp == "not found") { temp = "100"; }
306 //saves precision legnth for formatting below
307 length = temp.length();
308 m->mothurConvert(temp, precision);
310 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
311 hard = m->isTrue(temp);
313 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
314 large = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
317 m->setProcessors(temp);
318 m->mothurConvert(temp, processors);
320 temp = validParameter.validFile(parameters, "splitmethod", false);
321 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322 if (temp == "not found") { splitmethod = "distance"; }
323 else { splitmethod = temp; }
326 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
327 classic = m->isTrue(temp);
329 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
331 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
332 m->mothurConvert(temp, cutoff);
333 cutoff += (5 / (precision * 10.0));
335 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
336 m->mothurConvert(temp, taxLevelCutoff);
338 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
340 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
343 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
346 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
348 showabund = validParameter.validFile(parameters, "showabund", false);
349 if (showabund == "not found") { showabund = "T"; }
351 temp = validParameter.validFile(parameters, "cluster", false); if (temp == "not found") { temp = "T"; }
352 runCluster = m->isTrue(temp);
354 timing = validParameter.validFile(parameters, "timing", false);
355 if (timing == "not found") { timing = "F"; }
359 catch(exception& e) {
360 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
365 //**********************************************************************************************************************
367 int ClusterSplitCommand::execute(){
370 if (abort == true) { if (calledHelp) { return 0; } return 2; }
373 vector<string> listFileNames;
375 string singletonName = "";
376 double saveCutoff = cutoff;
378 //****************** file prep work ******************************//
383 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
386 if (pid == 0) { //only process 0 converts and splits
390 //if user gave a phylip file convert to column file
391 if (format == "phylip") {
393 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
395 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
397 NameAssignment* nameMap = NULL;
398 convert->setFormat("phylip");
399 convert->read(nameMap);
401 if (m->control_pressed) { delete convert; return 0; }
403 distfile = convert->getOutputFile();
405 //if no names file given with phylip file, create it
406 ListVector* listToMakeNameFile = convert->getListVector();
407 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
409 namefile = phylipfile + ".names";
410 m->openOutputFile(namefile, out);
411 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412 string bin = listToMakeNameFile->get(i);
413 out << bin << '\t' << bin << endl;
417 delete listToMakeNameFile;
420 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
422 if (m->control_pressed) { return 0; }
425 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
427 //split matrix into non-overlapping groups
429 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
430 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
431 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
432 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
436 if (m->control_pressed) { delete split; return 0; }
438 singletonName = split->getSingletonNames();
439 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
442 if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
444 //output a merged distance file
445 //if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
447 if (m->control_pressed) { return 0; }
449 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
454 m->mothurOutEndLine();
455 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
456 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
457 m->mothurOutEndLine();
462 //****************** break up files between processes and cluster each file set ******************************//
464 ////you are process 0 from above////
466 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
467 dividedNames.resize(processors);
469 //for each file group figure out which process will complete it
470 //want to divide the load intelligently so the big files are spread between processes
471 for (int i = 0; i < distName.size(); i++) {
472 int processToAssign = (i+1) % processors;
473 if (processToAssign == 0) { processToAssign = processors; }
475 dividedNames[(processToAssign-1)].push_back(distName[i]);
478 //not lets reverse the order of ever other process, so we balance big files running with little ones
479 for (int i = 0; i < processors; i++) {
480 int remainder = ((i+1) % processors);
481 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
485 //send each child the list of files it needs to process
486 for(int i = 1; i < processors; i++) {
487 //send number of file pairs
488 int num = dividedNames[i].size();
489 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
491 for (int j = 0; j < num; j++) { //send filenames to process i
492 char tempDistFileName[1024];
493 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
494 int lengthDist = (dividedNames[i][j].begin()->first).length();
496 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
497 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
499 char tempNameFileName[1024];
500 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
501 int lengthName = (dividedNames[i][j].begin()->second).length();
503 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
504 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
509 listFileNames = cluster(dividedNames[0], labels);
511 //receive the other processes info
512 for(int i = 1; i < processors; i++) {
513 int num = dividedNames[i].size();
516 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
517 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
519 //send list filenames to root process
520 for (int j = 0; j < num; j++) {
522 char tempListFileName[1024];
524 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
525 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
527 string myListFileName = tempListFileName;
528 myListFileName = myListFileName.substr(0, lengthList);
530 listFileNames.push_back(myListFileName);
533 //send Labels to root process
535 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
537 for (int j = 0; j < numLabels; j++) {
541 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
542 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
544 string myLabel = tempLabel;
545 myLabel = myLabel.substr(0, lengthLabel);
547 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
551 }else { //you are a child process
552 vector < map<string, string> > myNames;
554 //recieve the files you need to process
555 //receive number of file pairs
557 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
561 for (int j = 0; j < num; j++) { //receive filenames to process
563 char tempDistFileName[1024];
565 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
566 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
568 string myDistFileName = tempDistFileName;
569 myDistFileName = myDistFileName.substr(0, lengthDist);
572 char tempNameFileName[1024];
574 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
575 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
577 string myNameFileName = tempNameFileName;
578 myNameFileName = myNameFileName.substr(0, lengthName);
581 myNames[j][myDistFileName] = myNameFileName;
585 listFileNames = cluster(myNames, labels);
588 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
590 //send list filenames to root process
591 for (int j = 0; j < num; j++) {
592 char tempListFileName[1024];
593 strcpy(tempListFileName, listFileNames[j].c_str());
594 int lengthList = listFileNames[j].length();
596 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
597 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
600 //send Labels to root process
601 int numLabels = labels.size();
602 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
604 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
606 strcpy(tempLabel, (*it).c_str());
607 int lengthLabel = (*it).length();
609 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
610 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
615 MPI_Barrier(MPI_COMM_WORLD);
618 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
620 if (processors > distName.size()) { processors = distName.size(); }
622 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
624 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
626 listFileNames = createProcesses(distName, labels);
629 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
632 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
634 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
636 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
638 //****************** merge list file and create rabund and sabund files ******************************//
640 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
643 if (pid == 0) { //only process 0 merges
646 ListVector* listSingle;
647 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
649 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
651 mergeLists(listFileNames, labelBins, listSingle);
653 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
655 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
657 //set list file as new current listfile
659 itTypes = outputTypes.find("list");
660 if (itTypes != outputTypes.end()) {
661 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
664 //set rabund file as new current rabundfile
665 itTypes = outputTypes.find("rabund");
666 if (itTypes != outputTypes.end()) {
667 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
670 //set sabund file as new current sabundfile
671 itTypes = outputTypes.find("sabund");
672 if (itTypes != outputTypes.end()) {
673 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
676 m->mothurOutEndLine();
677 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
678 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
679 m->mothurOutEndLine();
682 } //only process 0 merges
685 MPI_Barrier(MPI_COMM_WORLD);
690 catch(exception& e) {
691 m->errorOut(e, "ClusterSplitCommand", "execute");
695 //**********************************************************************************************************************
696 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
699 map<float, int> labelBin;
700 vector<float> orderFloat;
704 if (singleton != "none") {
707 m->openInputFile(singleton, in);
709 string firstCol, secondCol;
710 listSingle = new ListVector();
712 if (countfile != "") { m->getline(in); m->gobble(in); }
715 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
716 if (countfile == "") { listSingle->push_back(secondCol); }
717 else { listSingle->push_back(firstCol); }
721 m->mothurRemove(singleton);
723 numSingleBins = listSingle->getNumBins();
724 }else{ listSingle = NULL; numSingleBins = 0; }
726 //go through users set and make them floats so we can sort them
727 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
730 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
731 else if (*it == "unique") { temp = -1.0; }
733 if (temp <= cutoff) {
734 orderFloat.push_back(temp);
735 labelBin[temp] = numSingleBins; //initialize numbins
740 sort(orderFloat.begin(), orderFloat.end());
743 //get the list info from each file
744 for (int k = 0; k < listNames.size(); k++) {
746 if (m->control_pressed) {
747 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
748 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
752 InputData* input = new InputData(listNames[k], "list");
753 ListVector* list = input->getListVector();
754 string lastLabel = list->getLabel();
756 string filledInList = listNames[k] + "filledInTemp";
758 m->openOutputFile(filledInList, outFilled);
760 //for each label needed
761 for(int l = 0; l < orderFloat.size(); l++){
764 if (orderFloat[l] == -1) { thisLabel = "unique"; }
765 else { thisLabel = toString(orderFloat[l], length-1); }
767 //this file has reached the end
769 list = input->getListVector(lastLabel, true);
770 }else{ //do you have the distance, or do you need to fill in
773 if (list->getLabel() == "unique") { labelFloat = -1.0; }
774 else { convert(list->getLabel(), labelFloat); }
776 //check for missing labels
777 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
778 //if its bigger get last label, otherwise keep it
780 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
782 lastLabel = list->getLabel();
786 list->setLabel(thisLabel);
787 list->print(outFilled);
790 labelBin[orderFloat[l]] += list->getNumBins();
794 list = input->getListVector();
797 if (list != NULL) { delete list; }
801 m->mothurRemove(listNames[k]);
802 rename(filledInList.c_str(), listNames[k].c_str());
807 catch(exception& e) {
808 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
812 //**********************************************************************************************************************
813 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
815 if (outputDir == "") { outputDir += m->hasPath(distfile); }
816 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
818 map<string, string> variables;
819 variables["[filename]"] = fileroot;
820 variables["[clustertag]"] = tag;
821 string sabundFileName = getOutputFileName("sabund", variables);
822 string rabundFileName = getOutputFileName("rabund", variables);
823 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
824 string listFileName = getOutputFileName("list", variables);
826 if (countfile == "") {
827 m->openOutputFile(sabundFileName, outSabund);
828 m->openOutputFile(rabundFileName, outRabund);
829 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
830 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
833 m->openOutputFile(listFileName, outList);
834 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
837 map<float, int>::iterator itLabel;
839 //for each label needed
840 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
843 if (itLabel->first == -1) { thisLabel = "unique"; }
844 else { thisLabel = toString(itLabel->first, length-1); }
846 outList << thisLabel << '\t' << itLabel->second << '\t';
848 RAbundVector* rabund = NULL;
849 if (countfile == "") {
850 rabund = new RAbundVector();
851 rabund->setLabel(thisLabel);
855 if (listSingle != NULL) {
856 for (int j = 0; j < listSingle->getNumBins(); j++) {
857 outList << listSingle->get(j) << '\t';
858 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
862 //get the list info from each file
863 for (int k = 0; k < listNames.size(); k++) {
865 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
867 InputData* input = new InputData(listNames[k], "list");
868 ListVector* list = input->getListVector(thisLabel);
870 //this file has reached the end
871 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
873 for (int j = 0; j < list->getNumBins(); j++) {
874 outList << list->get(j) << '\t';
875 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
882 if (countfile == "") {
883 SAbundVector sabund = rabund->getSAbundVector();
884 sabund.print(outSabund);
885 rabund->print(outRabund);
889 if (rabund != NULL) { delete rabund; }
893 if (countfile == "") {
897 if (listSingle != NULL) { delete listSingle; }
899 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
903 catch(exception& e) {
904 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
909 //**********************************************************************************************************************
911 void ClusterSplitCommand::printData(ListVector* oldList){
913 string label = oldList->getLabel();
914 RAbundVector oldRAbund = oldList->getRAbundVector();
916 oldRAbund.setLabel(label);
917 if (m->isTrue(showabund)) {
918 oldRAbund.getSAbundVector().print(cout);
920 oldRAbund.print(outRabund);
921 oldRAbund.getSAbundVector().print(outSabund);
923 oldList->print(outList);
925 catch(exception& e) {
926 m->errorOut(e, "ClusterSplitCommand", "printData");
930 //**********************************************************************************************************************
931 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
934 vector<string> listFiles;
935 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
936 dividedNames.resize(processors);
938 //for each file group figure out which process will complete it
939 //want to divide the load intelligently so the big files are spread between processes
940 for (int i = 0; i < distName.size(); i++) {
942 int processToAssign = (i+1) % processors;
943 if (processToAssign == 0) { processToAssign = processors; }
945 dividedNames[(processToAssign-1)].push_back(distName[i]);
946 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
949 //now lets reverse the order of ever other process, so we balance big files running with little ones
950 for (int i = 0; i < processors; i++) {
952 int remainder = ((i+1) % processors);
953 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
956 if (m->control_pressed) { return listFiles; }
958 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
962 //loop through and create all the processes you want
963 while (process != processors) {
967 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
971 vector<string> listFileNames = cluster(dividedNames[process], labels);
973 //write out names to file
974 string filename = toString(getpid()) + ".temp";
976 m->openOutputFile(filename, out);
978 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
983 filename = toString(getpid()) + ".temp.labels";
984 m->openOutputFile(filename, outLabels);
986 outLabels << cutoff << endl;
987 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
988 outLabels << (*it) << endl;
994 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
995 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1001 listFiles = cluster(dividedNames[0], labels);
1003 //force parent to wait until all the processes are done
1004 for (int i=0;i< processIDS.size();i++) {
1005 int temp = processIDS[i];
1009 //get list of list file names from each process
1010 for(int i=0;i<processIDS.size();i++){
1011 string filename = toString(processIDS[i]) + ".temp";
1013 m->openInputFile(filename, in);
1015 in >> tag; m->gobble(in);
1019 in >> tempName; m->gobble(in);
1020 listFiles.push_back(tempName);
1023 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1026 filename = toString(processIDS[i]) + ".temp.labels";
1028 m->openInputFile(filename, in2);
1031 in2 >> tempCutoff; m->gobble(in2);
1032 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1036 in2 >> tempName; m->gobble(in2);
1037 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1040 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1046 //////////////////////////////////////////////////////////////////////////////////////////////////////
1047 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1048 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1049 //Taking advantage of shared memory to allow both threads to add labels.
1050 //////////////////////////////////////////////////////////////////////////////////////////////////////
1052 vector<clusterData*> pDataArray;
1053 DWORD dwThreadIdArray[processors-1];
1054 HANDLE hThreadArray[processors-1];
1056 //Create processor worker threads.
1057 for( int i=1; i<processors; i++ ){
1058 // Allocate memory for thread data.
1059 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1060 pDataArray.push_back(tempCluster);
1061 processIDS.push_back(i);
1063 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1064 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1065 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1070 listFiles = cluster(dividedNames[0], labels);
1072 //Wait until all threads have terminated.
1073 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1075 //Close all thread handles and free memory allocations.
1076 for(int i=0; i < pDataArray.size(); i++){
1078 tag = pDataArray[i]->tag;
1079 //get listfiles created
1080 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1082 set<string>::iterator it;
1083 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1085 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1086 CloseHandle(hThreadArray[i]);
1087 delete pDataArray[i];
1095 catch(exception& e) {
1096 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1100 //**********************************************************************************************************************
1102 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1105 vector<string> listFileNames;
1106 double smallestCutoff = cutoff;
1108 //cluster each distance file
1109 for (int i = 0; i < distNames.size(); i++) {
1111 string thisNamefile = distNames[i].begin()->second;
1112 string thisDistFile = distNames[i].begin()->first;
1114 string listFileName = "";
1115 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1116 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1118 if (m->control_pressed) { //clean up
1119 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1120 listFileNames.clear(); return listFileNames;
1123 listFileNames.push_back(listFileName);
1126 cutoff = smallestCutoff;
1128 return listFileNames;
1131 catch(exception& e) {
1132 m->errorOut(e, "ClusterSplitCommand", "cluster");
1138 //**********************************************************************************************************************
1139 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1141 string listFileName = "";
1143 ListVector* list = NULL;
1145 RAbundVector* rabund = NULL;
1149 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1151 //output your files too
1153 cout << endl << "Reading " << thisDistFile << endl;
1157 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1159 //reads phylip file storing data in 2D vector, also fills list and rabund
1161 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1163 NameAssignment* nameMap = NULL;
1164 CountTable* ct = NULL;
1166 nameMap = new NameAssignment(thisNamefile);
1168 cluster->readPhylipFile(thisDistFile, nameMap);
1169 }else if (countfile != "") {
1170 ct = new CountTable();
1171 ct->readTable(thisNamefile);
1172 cluster->readPhylipFile(thisDistFile, ct);
1174 tag = cluster->getTag();
1176 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1177 else { delete ct; } delete cluster; return 0; }
1179 list = cluster->getListVector();
1180 rabund = cluster->getRAbundVector();
1182 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1183 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1184 listFileName = fileroot+ tag + ".list";
1187 m->openOutputFile(fileroot+ tag + ".list", listFile);
1189 float previousDist = 0.00000;
1190 float rndPreviousDist = 0.00000;
1194 //output your files too
1196 cout << endl << "Clustering " << thisDistFile << endl;
1200 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1202 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1203 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1204 else { delete ct; } return listFileName; }
1206 cluster->update(cutoff);
1208 float dist = cluster->getSmallDist();
1211 rndDist = m->ceilDist(dist, precision);
1213 rndDist = m->roundDist(dist, precision);
1216 if(previousDist <= 0.0000 && dist != previousDist){
1217 oldList.setLabel("unique");
1218 oldList.print(listFile);
1219 if (labels.count("unique") == 0) { labels.insert("unique"); }
1221 else if(rndDist != rndPreviousDist){
1222 oldList.setLabel(toString(rndPreviousDist, length-1));
1223 oldList.print(listFile);
1224 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1228 previousDist = dist;
1229 rndPreviousDist = rndDist;
1233 if(previousDist <= 0.0000){
1234 oldList.setLabel("unique");
1235 oldList.print(listFile);
1236 if (labels.count("unique") == 0) { labels.insert("unique"); }
1238 else if(rndPreviousDist<cutoff){
1239 oldList.setLabel(toString(rndPreviousDist, length-1));
1240 oldList.print(listFile);
1241 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1247 delete cluster; delete list; delete rabund;
1248 if(namefile != ""){ delete nameMap; }
1251 m->mothurRemove(thisDistFile);
1252 m->mothurRemove(thisNamefile);
1254 return listFileName;
1257 catch(exception& e) {
1258 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1263 //**********************************************************************************************************************
1264 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1266 string listFileName = "";
1268 Cluster* cluster = NULL;
1269 SparseDistanceMatrix* matrix = NULL;
1270 ListVector* list = NULL;
1272 RAbundVector* rabund = NULL;
1274 if (m->control_pressed) { return listFileName; }
1278 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1280 //output your files too
1282 cout << endl << "Reading " << thisDistFile << endl;
1286 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1288 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1289 read->setCutoff(cutoff);
1291 NameAssignment* nameMap = NULL;
1292 CountTable* ct = NULL;
1294 nameMap = new NameAssignment(thisNamefile);
1296 read->read(nameMap);
1297 }else if (countfile != "") {
1298 ct = new CountTable();
1299 ct->readTable(thisNamefile);
1301 }else { read->read(nameMap); }
1303 list = read->getListVector();
1305 matrix = read->getDMatrix();
1307 if(countfile != "") {
1308 rabund = new RAbundVector();
1309 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1311 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1313 delete read; read = NULL;
1314 if (namefile != "") { delete nameMap; nameMap = NULL; }
1318 //output your files too
1320 cout << endl << "Clustering " << thisDistFile << endl;
1324 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1327 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1328 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1329 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1330 tag = cluster->getTag();
1332 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1333 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1336 m->openOutputFile(fileroot+ tag + ".list", listFile);
1338 listFileName = fileroot+ tag + ".list";
1340 float previousDist = 0.00000;
1341 float rndPreviousDist = 0.00000;
1347 double saveCutoff = cutoff;
1349 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1351 if (m->control_pressed) { //clean up
1352 delete matrix; delete list; delete cluster; delete rabund;
1354 m->mothurRemove(listFileName);
1355 return listFileName;
1358 cluster->update(saveCutoff);
1360 float dist = matrix->getSmallDist();
1363 rndDist = m->ceilDist(dist, precision);
1365 rndDist = m->roundDist(dist, precision);
1368 if(previousDist <= 0.0000 && dist != previousDist){
1369 oldList.setLabel("unique");
1370 oldList.print(listFile);
1371 if (labels.count("unique") == 0) { labels.insert("unique"); }
1373 else if(rndDist != rndPreviousDist){
1374 oldList.setLabel(toString(rndPreviousDist, length-1));
1375 oldList.print(listFile);
1376 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1379 previousDist = dist;
1380 rndPreviousDist = rndDist;
1385 if(previousDist <= 0.0000){
1386 oldList.setLabel("unique");
1387 oldList.print(listFile);
1388 if (labels.count("unique") == 0) { labels.insert("unique"); }
1390 else if(rndPreviousDist<cutoff){
1391 oldList.setLabel(toString(rndPreviousDist, length-1));
1392 oldList.print(listFile);
1393 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1396 delete matrix; delete list; delete cluster; delete rabund;
1397 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1400 if (m->control_pressed) { //clean up
1401 m->mothurRemove(listFileName);
1402 return listFileName;
1405 m->mothurRemove(thisDistFile);
1406 m->mothurRemove(thisNamefile);
1408 if (saveCutoff != cutoff) {
1409 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1410 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1412 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1415 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1417 return listFileName;
1420 catch(exception& e) {
1421 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1425 //**********************************************************************************************************************
1427 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1432 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1437 string thisOutputDir = outputDir;
1438 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1439 map<string, string> variables;
1440 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1441 string outputFileName = getOutputFileName("column", variables);
1442 m->mothurRemove(outputFileName);
1445 for (int i = 0; i < distNames.size(); i++) {
1446 if (m->control_pressed) { return 0; }
1448 string thisDistFile = distNames[i].begin()->first;
1450 m->appendFiles(thisDistFile, outputFileName);
1453 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1463 catch(exception& e) {
1464 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1468 //**********************************************************************************************************************
1469 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1471 rabund->setLabel(list->getLabel());
1472 for(int i = 0; i < list->getNumBins(); i++) {
1473 if (m->control_pressed) { break; }
1474 vector<string> binNames;
1475 string bin = list->get(i);
1476 m->splitAtComma(bin, binNames);
1478 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1479 rabund->push_back(total);
1483 catch(exception& e) {
1484 m->errorOut(e, "ClusterCommand", "createRabund");
1489 //**********************************************************************************************************************