2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37 vector<string> myArray;
38 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
42 m->errorOut(e, "ClusterSplitCommand", "setParameters");
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){
49 string helpString = "";
50 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
54 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
56 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59 helpString += "The name parameter allows you to enter your name file. \n";
60 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
69 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
88 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
89 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
90 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
91 else if (type == "column") { pattern = "[filename],dist"; }
92 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
97 m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){
104 abort = true; calledHelp = true;
106 vector<string> tempOutNames;
107 outputTypes["list"] = tempOutNames;
108 outputTypes["rabund"] = tempOutNames;
109 outputTypes["sabund"] = tempOutNames;
110 outputTypes["column"] = tempOutNames;
112 catch(exception& e) {
113 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option) {
121 abort = false; calledHelp = false;
124 //allow user to run help
125 if(option == "help") { help(); abort = true; calledHelp = true; }
126 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
129 vector<string> myArray = setParameters();
131 OptionParser parser(option);
132 map<string,string> parameters = parser.getParameters();
134 ValidParameters validParameter("cluster.split");
136 //check to make sure all parameters are valid for command
137 map<string,string>::iterator it;
138 for (it = parameters.begin(); it != parameters.end(); it++) {
139 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
144 //initialize outputTypes
145 vector<string> tempOutNames;
146 outputTypes["list"] = tempOutNames;
147 outputTypes["rabund"] = tempOutNames;
148 outputTypes["sabund"] = tempOutNames;
149 outputTypes["column"] = tempOutNames;
151 //if the user changes the output directory command factory will send this info to us in the output parameter
152 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
154 //if the user changes the input directory command factory will send this info to us in the output parameter
155 string inputDir = validParameter.validFile(parameters, "inputdir", false);
156 if (inputDir == "not found"){ inputDir = ""; }
159 it = parameters.find("phylip");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["phylip"] = inputDir + it->second; }
167 it = parameters.find("column");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["column"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
183 it = parameters.find("taxonomy");
184 //user has given a template file
185 if(it != parameters.end()){
186 path = m->hasPath(it->second);
187 //if the user has not given a path then, add inputdir. else leave path alone.
188 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
191 it = parameters.find("fasta");
192 //user has given a template file
193 if(it != parameters.end()){
194 path = m->hasPath(it->second);
195 //if the user has not given a path then, add inputdir. else leave path alone.
196 if (path == "") { parameters["fasta"] = inputDir + it->second; }
199 it = parameters.find("count");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["count"] = inputDir + it->second; }
208 //check for required parameters
209 phylipfile = validParameter.validFile(parameters, "phylip", true);
210 if (phylipfile == "not open") { abort = true; }
211 else if (phylipfile == "not found") { phylipfile = ""; }
212 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
214 columnfile = validParameter.validFile(parameters, "column", true);
215 if (columnfile == "not open") { abort = true; }
216 else if (columnfile == "not found") { columnfile = ""; }
217 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
219 namefile = validParameter.validFile(parameters, "name", true);
220 if (namefile == "not open") { abort = true; namefile = "";}
221 else if (namefile == "not found") { namefile = ""; }
222 else { m->setNameFile(namefile); }
224 countfile = validParameter.validFile(parameters, "count", true);
225 if (countfile == "not open") { abort = true; countfile = "";}
226 else if (countfile == "not found") { countfile = ""; }
227 else { m->setCountTableFile(countfile); }
229 fastafile = validParameter.validFile(parameters, "fasta", true);
230 if (fastafile == "not open") { abort = true; }
231 else if (fastafile == "not found") { fastafile = ""; }
232 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
234 taxFile = validParameter.validFile(parameters, "taxonomy", true);
235 if (taxFile == "not open") { taxFile = ""; abort = true; }
236 else if (taxFile == "not found") { taxFile = ""; }
237 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
239 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
240 //is there are current file available for either of these?
241 //give priority to column, then phylip, then fasta
242 columnfile = m->getColumnFile();
243 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
245 phylipfile = m->getPhylipFile();
246 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
248 fastafile = m->getFastaFile();
249 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
257 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
259 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
261 if (columnfile != "") {
262 if ((namefile == "") && (countfile == "")) {
263 namefile = m->getNameFile();
264 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
266 countfile = m->getCountTableFile();
267 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
269 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
276 if (fastafile != "") {
278 taxFile = m->getTaxonomyFile();
279 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
281 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
286 if ((namefile == "") && (countfile == "")) {
287 namefile = m->getNameFile();
288 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
290 countfile = m->getCountTableFile();
291 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
293 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
300 //check for optional parameter and set defaults
301 // ...at some point should added some additional type checking...
302 //get user cutoff and precision or use defaults
304 temp = validParameter.validFile(parameters, "precision", false);
305 if (temp == "not found") { temp = "100"; }
306 //saves precision legnth for formatting below
307 length = temp.length();
308 m->mothurConvert(temp, precision);
310 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
311 hard = m->isTrue(temp);
313 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
314 large = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
317 m->setProcessors(temp);
318 m->mothurConvert(temp, processors);
320 temp = validParameter.validFile(parameters, "splitmethod", false);
321 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322 if (temp == "not found") { splitmethod = "distance"; }
323 else { splitmethod = temp; }
326 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
327 classic = m->isTrue(temp);
329 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
331 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
332 m->mothurConvert(temp, cutoff);
333 cutoff += (5 / (precision * 10.0));
335 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
336 m->mothurConvert(temp, taxLevelCutoff);
338 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
340 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
343 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
346 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
348 showabund = validParameter.validFile(parameters, "showabund", false);
349 if (showabund == "not found") { showabund = "T"; }
351 temp = validParameter.validFile(parameters, "cluster", false); if (temp == "not found") { temp = "T"; }
352 runCluster = m->isTrue(temp);
354 timing = validParameter.validFile(parameters, "timing", false);
355 if (timing == "not found") { timing = "F"; }
359 catch(exception& e) {
360 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
365 //**********************************************************************************************************************
367 int ClusterSplitCommand::execute(){
370 if (abort == true) { if (calledHelp) { return 0; } return 2; }
373 vector<string> listFileNames;
375 string singletonName = "";
376 double saveCutoff = cutoff;
378 //****************** file prep work ******************************//
383 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
386 if (pid == 0) { //only process 0 converts and splits
390 //if user gave a phylip file convert to column file
391 if (format == "phylip") {
393 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
395 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
397 NameAssignment* nameMap = NULL;
398 convert->setFormat("phylip");
399 convert->read(nameMap);
401 if (m->control_pressed) { delete convert; return 0; }
403 distfile = convert->getOutputFile();
405 //if no names file given with phylip file, create it
406 ListVector* listToMakeNameFile = convert->getListVector();
407 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
409 namefile = phylipfile + ".names";
410 m->openOutputFile(namefile, out);
411 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412 string bin = listToMakeNameFile->get(i);
413 out << bin << '\t' << bin << endl;
417 delete listToMakeNameFile;
420 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
422 if (m->control_pressed) { return 0; }
425 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
427 //split matrix into non-overlapping groups
429 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
430 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
431 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
432 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
436 if (m->control_pressed) { delete split; return 0; }
438 singletonName = split->getSingletonNames();
439 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
442 if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
444 //output a merged distance file
445 //if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
447 if (m->control_pressed) { return 0; }
449 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
454 m->mothurOutEndLine();
455 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
456 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
457 m->mothurOutEndLine();
462 //****************** break up files between processes and cluster each file set ******************************//
464 ////you are process 0 from above////
466 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
467 dividedNames.resize(processors);
469 //for each file group figure out which process will complete it
470 //want to divide the load intelligently so the big files are spread between processes
471 for (int i = 0; i < distName.size(); i++) {
472 int processToAssign = (i+1) % processors;
473 if (processToAssign == 0) { processToAssign = processors; }
475 dividedNames[(processToAssign-1)].push_back(distName[i]);
478 //not lets reverse the order of ever other process, so we balance big files running with little ones
479 for (int i = 0; i < processors; i++) {
480 int remainder = ((i+1) % processors);
481 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
485 //send each child the list of files it needs to process
486 for(int i = 1; i < processors; i++) {
487 //send number of file pairs
488 int num = dividedNames[i].size();
489 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
491 for (int j = 0; j < num; j++) { //send filenames to process i
492 char tempDistFileName[1024];
493 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
494 int lengthDist = (dividedNames[i][j].begin()->first).length();
496 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
497 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
499 char tempNameFileName[1024];
500 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
501 int lengthName = (dividedNames[i][j].begin()->second).length();
503 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
504 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
509 listFileNames = cluster(dividedNames[0], labels);
511 //receive the other processes info
512 for(int i = 1; i < processors; i++) {
513 int num = dividedNames[i].size();
516 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
517 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
519 //send list filenames to root process
520 for (int j = 0; j < num; j++) {
522 char tempListFileName[1024];
524 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
525 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
527 string myListFileName = tempListFileName;
528 myListFileName = myListFileName.substr(0, lengthList);
530 listFileNames.push_back(myListFileName);
533 //send Labels to root process
535 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
537 for (int j = 0; j < numLabels; j++) {
541 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
542 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
544 string myLabel = tempLabel;
545 myLabel = myLabel.substr(0, lengthLabel);
547 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
551 }else { //you are a child process
552 vector < map<string, string> > myNames;
554 //recieve the files you need to process
555 //receive number of file pairs
557 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
561 for (int j = 0; j < num; j++) { //receive filenames to process
563 char tempDistFileName[1024];
565 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
566 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
568 string myDistFileName = tempDistFileName;
569 myDistFileName = myDistFileName.substr(0, lengthDist);
572 char tempNameFileName[1024];
574 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
575 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
577 string myNameFileName = tempNameFileName;
578 myNameFileName = myNameFileName.substr(0, lengthName);
581 myNames[j][myDistFileName] = myNameFileName;
585 listFileNames = cluster(myNames, labels);
588 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
590 //send list filenames to root process
591 for (int j = 0; j < num; j++) {
592 char tempListFileName[1024];
593 strcpy(tempListFileName, listFileNames[j].c_str());
594 int lengthList = listFileNames[j].length();
596 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
597 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
600 //send Labels to root process
601 int numLabels = labels.size();
602 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
604 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
606 strcpy(tempLabel, (*it).c_str());
607 int lengthLabel = (*it).length();
609 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
610 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
615 MPI_Barrier(MPI_COMM_WORLD);
618 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
620 if (processors > distName.size()) { processors = distName.size(); }
622 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
624 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
626 listFileNames = createProcesses(distName, labels);
629 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
632 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
634 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
636 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
638 //****************** merge list file and create rabund and sabund files ******************************//
640 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
643 if (pid == 0) { //only process 0 merges
646 ListVector* listSingle;
647 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
649 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
651 mergeLists(listFileNames, labelBins, listSingle);
653 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
655 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
657 //set list file as new current listfile
659 itTypes = outputTypes.find("list");
660 if (itTypes != outputTypes.end()) {
661 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
664 //set rabund file as new current rabundfile
665 itTypes = outputTypes.find("rabund");
666 if (itTypes != outputTypes.end()) {
667 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
670 //set sabund file as new current sabundfile
671 itTypes = outputTypes.find("sabund");
672 if (itTypes != outputTypes.end()) {
673 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
676 m->mothurOutEndLine();
677 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
678 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
679 m->mothurOutEndLine();
682 } //only process 0 merges
685 MPI_Barrier(MPI_COMM_WORLD);
690 catch(exception& e) {
691 m->errorOut(e, "ClusterSplitCommand", "execute");
695 //**********************************************************************************************************************
696 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
699 map<float, int> labelBin;
700 vector<float> orderFloat;
704 if (singleton != "none") {
707 m->openInputFile(singleton, in);
709 string firstCol, secondCol;
710 listSingle = new ListVector();
712 if (countfile != "") { m->getline(in); m->gobble(in); }
715 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
716 if (countfile == "") { listSingle->push_back(secondCol); }
717 else { listSingle->push_back(firstCol); }
721 m->mothurRemove(singleton);
723 numSingleBins = listSingle->getNumBins();
724 }else{ listSingle = NULL; numSingleBins = 0; }
726 //go through users set and make them floats so we can sort them
727 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
730 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
731 else if (*it == "unique") { temp = -1.0; }
733 if (temp <= cutoff) {
734 orderFloat.push_back(temp);
735 labelBin[temp] = numSingleBins; //initialize numbins
740 sort(orderFloat.begin(), orderFloat.end());
743 //get the list info from each file
744 for (int k = 0; k < listNames.size(); k++) {
746 if (m->control_pressed) {
747 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
748 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
752 InputData* input = new InputData(listNames[k], "list");
753 ListVector* list = input->getListVector();
754 string lastLabel = list->getLabel();
756 string filledInList = listNames[k] + "filledInTemp";
758 m->openOutputFile(filledInList, outFilled);
760 //for each label needed
761 for(int l = 0; l < orderFloat.size(); l++){
764 if (orderFloat[l] == -1) { thisLabel = "unique"; }
765 else { thisLabel = toString(orderFloat[l], length-1); }
767 //this file has reached the end
769 list = input->getListVector(lastLabel, true);
770 }else{ //do you have the distance, or do you need to fill in
773 if (list->getLabel() == "unique") { labelFloat = -1.0; }
774 else { convert(list->getLabel(), labelFloat); }
776 //check for missing labels
777 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
778 //if its bigger get last label, otherwise keep it
780 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
782 lastLabel = list->getLabel();
786 list->setLabel(thisLabel);
787 list->print(outFilled);
790 labelBin[orderFloat[l]] += list->getNumBins();
794 list = input->getListVector();
797 if (list != NULL) { delete list; }
801 m->mothurRemove(listNames[k]);
802 rename(filledInList.c_str(), listNames[k].c_str());
807 catch(exception& e) {
808 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
812 //**********************************************************************************************************************
813 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
815 if (outputDir == "") { outputDir += m->hasPath(distfile); }
816 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
818 map<string, string> variables;
819 variables["[filename]"] = fileroot;
820 variables["[clustertag]"] = tag;
821 string sabundFileName = getOutputFileName("sabund", variables);
822 string rabundFileName = getOutputFileName("rabund", variables);
823 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
824 string listFileName = getOutputFileName("list", variables);
826 if (countfile == "") {
827 m->openOutputFile(sabundFileName, outSabund);
828 m->openOutputFile(rabundFileName, outRabund);
829 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
830 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
833 m->openOutputFile(listFileName, outList);
834 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
837 map<float, int>::iterator itLabel;
839 //for each label needed
840 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
843 if (itLabel->first == -1) { thisLabel = "unique"; }
844 else { thisLabel = toString(itLabel->first, length-1); }
846 //outList << thisLabel << '\t' << itLabel->second << '\t';
848 RAbundVector* rabund = NULL;
849 ListVector completeList;
850 completeList.setLabel(thisLabel);
852 if (countfile == "") {
853 rabund = new RAbundVector();
854 rabund->setLabel(thisLabel);
858 if (listSingle != NULL) {
859 for (int j = 0; j < listSingle->getNumBins(); j++) {
860 //outList << listSingle->get(j) << '\t';
861 completeList.push_back(listSingle->get(j));
862 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
866 //get the list info from each file
867 for (int k = 0; k < listNames.size(); k++) {
869 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
871 InputData* input = new InputData(listNames[k], "list");
872 ListVector* list = input->getListVector(thisLabel);
874 //this file has reached the end
875 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
877 for (int j = 0; j < list->getNumBins(); j++) {
878 //outList << list->get(j) << '\t';
879 completeList.push_back(list->get(j));
880 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
887 if (countfile == "") {
888 SAbundVector sabund = rabund->getSAbundVector();
889 sabund.print(outSabund);
890 rabund->print(outRabund);
893 completeList.print(outList);
895 if (rabund != NULL) { delete rabund; }
899 if (countfile == "") {
903 if (listSingle != NULL) { delete listSingle; }
905 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
909 catch(exception& e) {
910 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
915 //**********************************************************************************************************************
917 void ClusterSplitCommand::printData(ListVector* oldList){
919 string label = oldList->getLabel();
920 RAbundVector oldRAbund = oldList->getRAbundVector();
922 oldRAbund.setLabel(label);
923 if (m->isTrue(showabund)) {
924 oldRAbund.getSAbundVector().print(cout);
926 oldRAbund.print(outRabund);
927 oldRAbund.getSAbundVector().print(outSabund);
929 oldList->print(outList);
931 catch(exception& e) {
932 m->errorOut(e, "ClusterSplitCommand", "printData");
936 //**********************************************************************************************************************
937 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
940 vector<string> listFiles;
941 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
942 dividedNames.resize(processors);
944 //for each file group figure out which process will complete it
945 //want to divide the load intelligently so the big files are spread between processes
946 for (int i = 0; i < distName.size(); i++) {
948 int processToAssign = (i+1) % processors;
949 if (processToAssign == 0) { processToAssign = processors; }
951 dividedNames[(processToAssign-1)].push_back(distName[i]);
952 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
955 //now lets reverse the order of ever other process, so we balance big files running with little ones
956 for (int i = 0; i < processors; i++) {
958 int remainder = ((i+1) % processors);
959 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
962 if (m->control_pressed) { return listFiles; }
964 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
968 //loop through and create all the processes you want
969 while (process != processors) {
973 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
977 vector<string> listFileNames = cluster(dividedNames[process], labels);
979 //write out names to file
980 string filename = toString(getpid()) + ".temp";
982 m->openOutputFile(filename, out);
984 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
989 filename = toString(getpid()) + ".temp.labels";
990 m->openOutputFile(filename, outLabels);
992 outLabels << cutoff << endl;
993 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
994 outLabels << (*it) << endl;
1000 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1001 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1007 listFiles = cluster(dividedNames[0], labels);
1009 //force parent to wait until all the processes are done
1010 for (int i=0;i< processIDS.size();i++) {
1011 int temp = processIDS[i];
1015 //get list of list file names from each process
1016 for(int i=0;i<processIDS.size();i++){
1017 string filename = toString(processIDS[i]) + ".temp";
1019 m->openInputFile(filename, in);
1021 in >> tag; m->gobble(in);
1025 in >> tempName; m->gobble(in);
1026 listFiles.push_back(tempName);
1029 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1032 filename = toString(processIDS[i]) + ".temp.labels";
1034 m->openInputFile(filename, in2);
1037 in2 >> tempCutoff; m->gobble(in2);
1038 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1042 in2 >> tempName; m->gobble(in2);
1043 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1046 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1052 //////////////////////////////////////////////////////////////////////////////////////////////////////
1053 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1054 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1055 //Taking advantage of shared memory to allow both threads to add labels.
1056 //////////////////////////////////////////////////////////////////////////////////////////////////////
1058 vector<clusterData*> pDataArray;
1059 DWORD dwThreadIdArray[processors-1];
1060 HANDLE hThreadArray[processors-1];
1062 //Create processor worker threads.
1063 for( int i=1; i<processors; i++ ){
1064 // Allocate memory for thread data.
1065 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1066 pDataArray.push_back(tempCluster);
1067 processIDS.push_back(i);
1069 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1070 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1071 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1076 listFiles = cluster(dividedNames[0], labels);
1078 //Wait until all threads have terminated.
1079 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1081 //Close all thread handles and free memory allocations.
1082 for(int i=0; i < pDataArray.size(); i++){
1084 tag = pDataArray[i]->tag;
1085 //get listfiles created
1086 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1088 set<string>::iterator it;
1089 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1091 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1092 CloseHandle(hThreadArray[i]);
1093 delete pDataArray[i];
1101 catch(exception& e) {
1102 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1106 //**********************************************************************************************************************
1108 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1111 vector<string> listFileNames;
1112 double smallestCutoff = cutoff;
1114 //cluster each distance file
1115 for (int i = 0; i < distNames.size(); i++) {
1117 string thisNamefile = distNames[i].begin()->second;
1118 string thisDistFile = distNames[i].begin()->first;
1120 string listFileName = "";
1121 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1122 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1124 if (m->control_pressed) { //clean up
1125 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1126 listFileNames.clear(); return listFileNames;
1129 listFileNames.push_back(listFileName);
1132 cutoff = smallestCutoff;
1134 return listFileNames;
1137 catch(exception& e) {
1138 m->errorOut(e, "ClusterSplitCommand", "cluster");
1144 //**********************************************************************************************************************
1145 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1147 string listFileName = "";
1149 ListVector* list = NULL;
1151 RAbundVector* rabund = NULL;
1155 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1157 //output your files too
1159 cout << endl << "Reading " << thisDistFile << endl;
1163 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1165 //reads phylip file storing data in 2D vector, also fills list and rabund
1167 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1169 NameAssignment* nameMap = NULL;
1170 CountTable* ct = NULL;
1172 nameMap = new NameAssignment(thisNamefile);
1174 cluster->readPhylipFile(thisDistFile, nameMap);
1175 }else if (countfile != "") {
1176 ct = new CountTable();
1177 ct->readTable(thisNamefile, false);
1178 cluster->readPhylipFile(thisDistFile, ct);
1180 tag = cluster->getTag();
1182 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1183 else { delete ct; } delete cluster; return 0; }
1185 list = cluster->getListVector();
1186 rabund = cluster->getRAbundVector();
1188 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1189 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1190 listFileName = fileroot+ tag + ".list";
1193 m->openOutputFile(fileroot+ tag + ".list", listFile);
1195 float previousDist = 0.00000;
1196 float rndPreviousDist = 0.00000;
1200 //output your files too
1202 cout << endl << "Clustering " << thisDistFile << endl;
1206 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1208 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1209 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1210 else { delete ct; } return listFileName; }
1212 cluster->update(cutoff);
1214 float dist = cluster->getSmallDist();
1217 rndDist = m->ceilDist(dist, precision);
1219 rndDist = m->roundDist(dist, precision);
1222 if(previousDist <= 0.0000 && dist != previousDist){
1223 oldList.setLabel("unique");
1224 oldList.print(listFile);
1225 if (labels.count("unique") == 0) { labels.insert("unique"); }
1227 else if(rndDist != rndPreviousDist){
1228 oldList.setLabel(toString(rndPreviousDist, length-1));
1229 oldList.print(listFile);
1230 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1234 previousDist = dist;
1235 rndPreviousDist = rndDist;
1239 if(previousDist <= 0.0000){
1240 oldList.setLabel("unique");
1241 oldList.print(listFile);
1242 if (labels.count("unique") == 0) { labels.insert("unique"); }
1244 else if(rndPreviousDist<cutoff){
1245 oldList.setLabel(toString(rndPreviousDist, length-1));
1246 oldList.print(listFile);
1247 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1253 delete cluster; delete list; delete rabund;
1254 if(namefile != ""){ delete nameMap; }
1257 m->mothurRemove(thisDistFile);
1258 m->mothurRemove(thisNamefile);
1260 return listFileName;
1263 catch(exception& e) {
1264 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1269 //**********************************************************************************************************************
1270 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1272 string listFileName = "";
1274 Cluster* cluster = NULL;
1275 SparseDistanceMatrix* matrix = NULL;
1276 ListVector* list = NULL;
1278 RAbundVector* rabund = NULL;
1280 if (m->control_pressed) { return listFileName; }
1284 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1286 //output your files too
1288 cout << endl << "Reading " << thisDistFile << endl;
1292 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1294 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1295 read->setCutoff(cutoff);
1297 NameAssignment* nameMap = NULL;
1298 CountTable* ct = NULL;
1300 nameMap = new NameAssignment(thisNamefile);
1302 read->read(nameMap);
1303 }else if (countfile != "") {
1304 ct = new CountTable();
1305 ct->readTable(thisNamefile, false);
1307 }else { read->read(nameMap); }
1309 list = read->getListVector();
1311 matrix = read->getDMatrix();
1313 if(countfile != "") {
1314 rabund = new RAbundVector();
1315 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1317 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1319 delete read; read = NULL;
1320 if (namefile != "") { delete nameMap; nameMap = NULL; }
1324 //output your files too
1326 cout << endl << "Clustering " << thisDistFile << endl;
1330 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1333 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1334 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1335 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1336 tag = cluster->getTag();
1338 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1339 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1342 m->openOutputFile(fileroot+ tag + ".list", listFile);
1344 listFileName = fileroot+ tag + ".list";
1346 float previousDist = 0.00000;
1347 float rndPreviousDist = 0.00000;
1353 double saveCutoff = cutoff;
1355 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1357 if (m->control_pressed) { //clean up
1358 delete matrix; delete list; delete cluster; delete rabund;
1360 m->mothurRemove(listFileName);
1361 return listFileName;
1364 cluster->update(saveCutoff);
1366 float dist = matrix->getSmallDist();
1369 rndDist = m->ceilDist(dist, precision);
1371 rndDist = m->roundDist(dist, precision);
1374 if(previousDist <= 0.0000 && dist != previousDist){
1375 oldList.setLabel("unique");
1376 oldList.print(listFile);
1377 if (labels.count("unique") == 0) { labels.insert("unique"); }
1379 else if(rndDist != rndPreviousDist){
1380 oldList.setLabel(toString(rndPreviousDist, length-1));
1381 oldList.print(listFile);
1382 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1385 previousDist = dist;
1386 rndPreviousDist = rndDist;
1391 if(previousDist <= 0.0000){
1392 oldList.setLabel("unique");
1393 oldList.print(listFile);
1394 if (labels.count("unique") == 0) { labels.insert("unique"); }
1396 else if(rndPreviousDist<cutoff){
1397 oldList.setLabel(toString(rndPreviousDist, length-1));
1398 oldList.print(listFile);
1399 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1402 delete matrix; delete list; delete cluster; delete rabund;
1403 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1406 if (m->control_pressed) { //clean up
1407 m->mothurRemove(listFileName);
1408 return listFileName;
1411 m->mothurRemove(thisDistFile);
1412 m->mothurRemove(thisNamefile);
1414 if (saveCutoff != cutoff) {
1415 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1416 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1418 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1421 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1423 return listFileName;
1426 catch(exception& e) {
1427 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1431 //**********************************************************************************************************************
1433 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1438 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1443 string thisOutputDir = outputDir;
1444 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1445 map<string, string> variables;
1446 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1447 string outputFileName = getOutputFileName("column", variables);
1448 m->mothurRemove(outputFileName);
1451 for (int i = 0; i < distNames.size(); i++) {
1452 if (m->control_pressed) { return 0; }
1454 string thisDistFile = distNames[i].begin()->first;
1456 m->appendFiles(thisDistFile, outputFileName);
1459 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1469 catch(exception& e) {
1470 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1474 //**********************************************************************************************************************
1475 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1477 rabund->setLabel(list->getLabel());
1478 for(int i = 0; i < list->getNumBins(); i++) {
1479 if (m->control_pressed) { break; }
1480 vector<string> binNames;
1481 string bin = list->get(i);
1482 m->splitAtComma(bin, binNames);
1484 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1485 rabund->push_back(total);
1489 catch(exception& e) {
1490 m->errorOut(e, "ClusterCommand", "createRabund");
1495 //**********************************************************************************************************************