2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37 vector<string> myArray;
38 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
42 m->errorOut(e, "ClusterSplitCommand", "setParameters");
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){
49 string helpString = "";
50 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
54 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
56 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59 helpString += "The name parameter allows you to enter your name file. \n";
60 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
65 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
69 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
88 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
89 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
90 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
91 else if (type == "column") { pattern = "[filename],dist"; }
92 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
97 m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){
104 abort = true; calledHelp = true;
106 vector<string> tempOutNames;
107 outputTypes["list"] = tempOutNames;
108 outputTypes["rabund"] = tempOutNames;
109 outputTypes["sabund"] = tempOutNames;
110 outputTypes["column"] = tempOutNames;
112 catch(exception& e) {
113 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option) {
121 abort = false; calledHelp = false;
124 //allow user to run help
125 if(option == "help") { help(); abort = true; calledHelp = true; }
126 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
129 vector<string> myArray = setParameters();
131 OptionParser parser(option);
132 map<string,string> parameters = parser.getParameters();
134 ValidParameters validParameter("cluster.split");
136 //check to make sure all parameters are valid for command
137 map<string,string>::iterator it;
138 for (it = parameters.begin(); it != parameters.end(); it++) {
139 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
144 //initialize outputTypes
145 vector<string> tempOutNames;
146 outputTypes["list"] = tempOutNames;
147 outputTypes["rabund"] = tempOutNames;
148 outputTypes["sabund"] = tempOutNames;
149 outputTypes["column"] = tempOutNames;
151 //if the user changes the output directory command factory will send this info to us in the output parameter
152 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
154 //if the user changes the input directory command factory will send this info to us in the output parameter
155 string inputDir = validParameter.validFile(parameters, "inputdir", false);
156 if (inputDir == "not found"){ inputDir = ""; }
159 it = parameters.find("phylip");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["phylip"] = inputDir + it->second; }
167 it = parameters.find("column");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["column"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
183 it = parameters.find("taxonomy");
184 //user has given a template file
185 if(it != parameters.end()){
186 path = m->hasPath(it->second);
187 //if the user has not given a path then, add inputdir. else leave path alone.
188 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
191 it = parameters.find("fasta");
192 //user has given a template file
193 if(it != parameters.end()){
194 path = m->hasPath(it->second);
195 //if the user has not given a path then, add inputdir. else leave path alone.
196 if (path == "") { parameters["fasta"] = inputDir + it->second; }
199 it = parameters.find("count");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["count"] = inputDir + it->second; }
208 //check for required parameters
209 phylipfile = validParameter.validFile(parameters, "phylip", true);
210 if (phylipfile == "not open") { abort = true; }
211 else if (phylipfile == "not found") { phylipfile = ""; }
212 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
214 columnfile = validParameter.validFile(parameters, "column", true);
215 if (columnfile == "not open") { abort = true; }
216 else if (columnfile == "not found") { columnfile = ""; }
217 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
219 namefile = validParameter.validFile(parameters, "name", true);
220 if (namefile == "not open") { abort = true; namefile = "";}
221 else if (namefile == "not found") { namefile = ""; }
222 else { m->setNameFile(namefile); }
224 countfile = validParameter.validFile(parameters, "count", true);
225 if (countfile == "not open") { abort = true; countfile = "";}
226 else if (countfile == "not found") { countfile = ""; }
227 else { m->setCountTableFile(countfile); }
229 fastafile = validParameter.validFile(parameters, "fasta", true);
230 if (fastafile == "not open") { abort = true; }
231 else if (fastafile == "not found") { fastafile = ""; }
232 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
234 taxFile = validParameter.validFile(parameters, "taxonomy", true);
235 if (taxFile == "not open") { taxFile = ""; abort = true; }
236 else if (taxFile == "not found") { taxFile = ""; }
237 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
239 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
240 //is there are current file available for either of these?
241 //give priority to column, then phylip, then fasta
242 columnfile = m->getColumnFile();
243 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
245 phylipfile = m->getPhylipFile();
246 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
248 fastafile = m->getFastaFile();
249 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
257 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
259 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
261 if (columnfile != "") {
262 if ((namefile == "") && (countfile == "")) {
263 namefile = m->getNameFile();
264 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
266 countfile = m->getCountTableFile();
267 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
269 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
276 if (fastafile != "") {
278 taxFile = m->getTaxonomyFile();
279 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
281 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
286 if ((namefile == "") && (countfile == "")) {
287 namefile = m->getNameFile();
288 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
290 countfile = m->getCountTableFile();
291 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
293 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
300 //check for optional parameter and set defaults
301 // ...at some point should added some additional type checking...
302 //get user cutoff and precision or use defaults
304 temp = validParameter.validFile(parameters, "precision", false);
305 if (temp == "not found") { temp = "100"; }
306 //saves precision legnth for formatting below
307 length = temp.length();
308 m->mothurConvert(temp, precision);
310 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
311 hard = m->isTrue(temp);
313 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
314 large = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
317 m->setProcessors(temp);
318 m->mothurConvert(temp, processors);
320 temp = validParameter.validFile(parameters, "splitmethod", false);
321 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322 if (temp == "not found") { splitmethod = "distance"; }
323 else { splitmethod = temp; }
326 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
327 classic = m->isTrue(temp);
329 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
331 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
332 m->mothurConvert(temp, cutoff);
333 cutoff += (5 / (precision * 10.0));
335 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
336 m->mothurConvert(temp, taxLevelCutoff);
338 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
340 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
343 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
346 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
348 showabund = validParameter.validFile(parameters, "showabund", false);
349 if (showabund == "not found") { showabund = "T"; }
351 temp = validParameter.validFile(parameters, "cluster", false); if (temp == "not found") { temp = "T"; }
352 runCluster = m->isTrue(temp);
354 timing = validParameter.validFile(parameters, "timing", false);
355 if (timing == "not found") { timing = "F"; }
359 catch(exception& e) {
360 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
365 //**********************************************************************************************************************
367 int ClusterSplitCommand::execute(){
370 if (abort == true) { if (calledHelp) { return 0; } return 2; }
373 vector<string> listFileNames;
375 string singletonName = "";
376 double saveCutoff = cutoff;
378 //****************** file prep work ******************************//
383 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
386 if (pid == 0) { //only process 0 converts and splits
390 //if user gave a phylip file convert to column file
391 if (format == "phylip") {
393 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
395 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
397 NameAssignment* nameMap = NULL;
398 convert->setFormat("phylip");
399 convert->read(nameMap);
401 if (m->control_pressed) { delete convert; return 0; }
403 distfile = convert->getOutputFile();
405 //if no names file given with phylip file, create it
406 ListVector* listToMakeNameFile = convert->getListVector();
407 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
409 namefile = phylipfile + ".names";
410 m->openOutputFile(namefile, out);
411 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412 string bin = listToMakeNameFile->get(i);
413 out << bin << '\t' << bin << endl;
417 delete listToMakeNameFile;
420 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
422 if (m->control_pressed) { return 0; }
425 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
427 //split matrix into non-overlapping groups
429 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
430 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
431 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
432 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
436 if (m->control_pressed) { delete split; return 0; }
438 singletonName = split->getSingletonNames();
439 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
442 if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
444 //output a merged distance file
445 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
448 if (m->control_pressed) { return 0; }
450 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
455 m->mothurOutEndLine();
456 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
457 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
458 m->mothurOutEndLine();
463 //****************** break up files between processes and cluster each file set ******************************//
465 ////you are process 0 from above////
467 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
468 dividedNames.resize(processors);
470 //for each file group figure out which process will complete it
471 //want to divide the load intelligently so the big files are spread between processes
472 for (int i = 0; i < distName.size(); i++) {
473 int processToAssign = (i+1) % processors;
474 if (processToAssign == 0) { processToAssign = processors; }
476 dividedNames[(processToAssign-1)].push_back(distName[i]);
479 //not lets reverse the order of ever other process, so we balance big files running with little ones
480 for (int i = 0; i < processors; i++) {
481 int remainder = ((i+1) % processors);
482 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
486 //send each child the list of files it needs to process
487 for(int i = 1; i < processors; i++) {
488 //send number of file pairs
489 int num = dividedNames[i].size();
490 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
492 for (int j = 0; j < num; j++) { //send filenames to process i
493 char tempDistFileName[1024];
494 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
495 int lengthDist = (dividedNames[i][j].begin()->first).length();
497 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
498 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
500 char tempNameFileName[1024];
501 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
502 int lengthName = (dividedNames[i][j].begin()->second).length();
504 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
505 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
510 listFileNames = cluster(dividedNames[0], labels);
512 //receive the other processes info
513 for(int i = 1; i < processors; i++) {
514 int num = dividedNames[i].size();
517 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
518 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
520 //send list filenames to root process
521 for (int j = 0; j < num; j++) {
523 char tempListFileName[1024];
525 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
526 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
528 string myListFileName = tempListFileName;
529 myListFileName = myListFileName.substr(0, lengthList);
531 listFileNames.push_back(myListFileName);
534 //send Labels to root process
536 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
538 for (int j = 0; j < numLabels; j++) {
542 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
543 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
545 string myLabel = tempLabel;
546 myLabel = myLabel.substr(0, lengthLabel);
548 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
552 }else { //you are a child process
553 vector < map<string, string> > myNames;
555 //recieve the files you need to process
556 //receive number of file pairs
558 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
562 for (int j = 0; j < num; j++) { //receive filenames to process
564 char tempDistFileName[1024];
566 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
567 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
569 string myDistFileName = tempDistFileName;
570 myDistFileName = myDistFileName.substr(0, lengthDist);
573 char tempNameFileName[1024];
575 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
576 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
578 string myNameFileName = tempNameFileName;
579 myNameFileName = myNameFileName.substr(0, lengthName);
582 myNames[j][myDistFileName] = myNameFileName;
586 listFileNames = cluster(myNames, labels);
589 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
591 //send list filenames to root process
592 for (int j = 0; j < num; j++) {
593 char tempListFileName[1024];
594 strcpy(tempListFileName, listFileNames[j].c_str());
595 int lengthList = listFileNames[j].length();
597 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
598 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
601 //send Labels to root process
602 int numLabels = labels.size();
603 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
605 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
607 strcpy(tempLabel, (*it).c_str());
608 int lengthLabel = (*it).length();
610 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
611 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
616 MPI_Barrier(MPI_COMM_WORLD);
619 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
621 if (processors > distName.size()) { processors = distName.size(); }
623 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
625 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
627 listFileNames = createProcesses(distName, labels);
630 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
633 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
635 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
637 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
639 //****************** merge list file and create rabund and sabund files ******************************//
641 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
644 if (pid == 0) { //only process 0 merges
647 ListVector* listSingle;
648 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
650 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
652 mergeLists(listFileNames, labelBins, listSingle);
654 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
656 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
658 //set list file as new current listfile
660 itTypes = outputTypes.find("list");
661 if (itTypes != outputTypes.end()) {
662 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
665 //set rabund file as new current rabundfile
666 itTypes = outputTypes.find("rabund");
667 if (itTypes != outputTypes.end()) {
668 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
671 //set sabund file as new current sabundfile
672 itTypes = outputTypes.find("sabund");
673 if (itTypes != outputTypes.end()) {
674 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
677 m->mothurOutEndLine();
678 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
679 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
680 m->mothurOutEndLine();
683 } //only process 0 merges
686 MPI_Barrier(MPI_COMM_WORLD);
691 catch(exception& e) {
692 m->errorOut(e, "ClusterSplitCommand", "execute");
696 //**********************************************************************************************************************
697 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
700 map<float, int> labelBin;
701 vector<float> orderFloat;
705 if (singleton != "none") {
708 m->openInputFile(singleton, in);
710 string firstCol, secondCol;
711 listSingle = new ListVector();
713 if (countfile != "") { m->getline(in); m->gobble(in); }
716 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
717 if (countfile == "") { listSingle->push_back(secondCol); }
718 else { listSingle->push_back(firstCol); }
722 m->mothurRemove(singleton);
724 numSingleBins = listSingle->getNumBins();
725 }else{ listSingle = NULL; numSingleBins = 0; }
727 //go through users set and make them floats so we can sort them
728 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
731 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
732 else if (*it == "unique") { temp = -1.0; }
734 if (temp <= cutoff) {
735 orderFloat.push_back(temp);
736 labelBin[temp] = numSingleBins; //initialize numbins
741 sort(orderFloat.begin(), orderFloat.end());
744 //get the list info from each file
745 for (int k = 0; k < listNames.size(); k++) {
747 if (m->control_pressed) {
748 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
749 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
753 InputData* input = new InputData(listNames[k], "list");
754 ListVector* list = input->getListVector();
755 string lastLabel = list->getLabel();
757 string filledInList = listNames[k] + "filledInTemp";
759 m->openOutputFile(filledInList, outFilled);
761 //for each label needed
762 for(int l = 0; l < orderFloat.size(); l++){
765 if (orderFloat[l] == -1) { thisLabel = "unique"; }
766 else { thisLabel = toString(orderFloat[l], length-1); }
768 //this file has reached the end
770 list = input->getListVector(lastLabel, true);
771 }else{ //do you have the distance, or do you need to fill in
774 if (list->getLabel() == "unique") { labelFloat = -1.0; }
775 else { convert(list->getLabel(), labelFloat); }
777 //check for missing labels
778 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
779 //if its bigger get last label, otherwise keep it
781 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
783 lastLabel = list->getLabel();
787 list->setLabel(thisLabel);
788 list->print(outFilled);
791 labelBin[orderFloat[l]] += list->getNumBins();
795 list = input->getListVector();
798 if (list != NULL) { delete list; }
802 m->mothurRemove(listNames[k]);
803 rename(filledInList.c_str(), listNames[k].c_str());
808 catch(exception& e) {
809 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
813 //**********************************************************************************************************************
814 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
816 if (outputDir == "") { outputDir += m->hasPath(distfile); }
817 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
819 map<string, string> variables;
820 variables["[filename]"] = fileroot;
821 variables["[clustertag]"] = tag;
822 string sabundFileName = getOutputFileName("sabund", variables);
823 string rabundFileName = getOutputFileName("rabund", variables);
824 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
825 string listFileName = getOutputFileName("list", variables);
827 if (countfile == "") {
828 m->openOutputFile(sabundFileName, outSabund);
829 m->openOutputFile(rabundFileName, outRabund);
830 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
831 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
834 m->openOutputFile(listFileName, outList);
835 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
838 map<float, int>::iterator itLabel;
840 //for each label needed
841 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
844 if (itLabel->first == -1) { thisLabel = "unique"; }
845 else { thisLabel = toString(itLabel->first, length-1); }
847 outList << thisLabel << '\t' << itLabel->second << '\t';
849 RAbundVector* rabund = NULL;
850 if (countfile == "") {
851 rabund = new RAbundVector();
852 rabund->setLabel(thisLabel);
856 if (listSingle != NULL) {
857 for (int j = 0; j < listSingle->getNumBins(); j++) {
858 outList << listSingle->get(j) << '\t';
859 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
863 //get the list info from each file
864 for (int k = 0; k < listNames.size(); k++) {
866 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
868 InputData* input = new InputData(listNames[k], "list");
869 ListVector* list = input->getListVector(thisLabel);
871 //this file has reached the end
872 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
874 for (int j = 0; j < list->getNumBins(); j++) {
875 outList << list->get(j) << '\t';
876 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
883 if (countfile == "") {
884 SAbundVector sabund = rabund->getSAbundVector();
885 sabund.print(outSabund);
886 rabund->print(outRabund);
890 if (rabund != NULL) { delete rabund; }
894 if (countfile == "") {
898 if (listSingle != NULL) { delete listSingle; }
900 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
904 catch(exception& e) {
905 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
910 //**********************************************************************************************************************
912 void ClusterSplitCommand::printData(ListVector* oldList){
914 string label = oldList->getLabel();
915 RAbundVector oldRAbund = oldList->getRAbundVector();
917 oldRAbund.setLabel(label);
918 if (m->isTrue(showabund)) {
919 oldRAbund.getSAbundVector().print(cout);
921 oldRAbund.print(outRabund);
922 oldRAbund.getSAbundVector().print(outSabund);
924 oldList->print(outList);
926 catch(exception& e) {
927 m->errorOut(e, "ClusterSplitCommand", "printData");
931 //**********************************************************************************************************************
932 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
935 vector<string> listFiles;
936 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
937 dividedNames.resize(processors);
939 //for each file group figure out which process will complete it
940 //want to divide the load intelligently so the big files are spread between processes
941 for (int i = 0; i < distName.size(); i++) {
943 int processToAssign = (i+1) % processors;
944 if (processToAssign == 0) { processToAssign = processors; }
946 dividedNames[(processToAssign-1)].push_back(distName[i]);
947 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
950 //now lets reverse the order of ever other process, so we balance big files running with little ones
951 for (int i = 0; i < processors; i++) {
953 int remainder = ((i+1) % processors);
954 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
957 if (m->control_pressed) { return listFiles; }
959 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
963 //loop through and create all the processes you want
964 while (process != processors) {
968 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
972 vector<string> listFileNames = cluster(dividedNames[process], labels);
974 //write out names to file
975 string filename = toString(getpid()) + ".temp";
977 m->openOutputFile(filename, out);
979 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
984 filename = toString(getpid()) + ".temp.labels";
985 m->openOutputFile(filename, outLabels);
987 outLabels << cutoff << endl;
988 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
989 outLabels << (*it) << endl;
995 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
996 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1002 listFiles = cluster(dividedNames[0], labels);
1004 //force parent to wait until all the processes are done
1005 for (int i=0;i< processIDS.size();i++) {
1006 int temp = processIDS[i];
1010 //get list of list file names from each process
1011 for(int i=0;i<processIDS.size();i++){
1012 string filename = toString(processIDS[i]) + ".temp";
1014 m->openInputFile(filename, in);
1016 in >> tag; m->gobble(in);
1020 in >> tempName; m->gobble(in);
1021 listFiles.push_back(tempName);
1024 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1027 filename = toString(processIDS[i]) + ".temp.labels";
1029 m->openInputFile(filename, in2);
1032 in2 >> tempCutoff; m->gobble(in2);
1033 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1037 in2 >> tempName; m->gobble(in2);
1038 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1041 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1047 //////////////////////////////////////////////////////////////////////////////////////////////////////
1048 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1049 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1050 //Taking advantage of shared memory to allow both threads to add labels.
1051 //////////////////////////////////////////////////////////////////////////////////////////////////////
1053 vector<clusterData*> pDataArray;
1054 DWORD dwThreadIdArray[processors-1];
1055 HANDLE hThreadArray[processors-1];
1057 //Create processor worker threads.
1058 for( int i=1; i<processors; i++ ){
1059 // Allocate memory for thread data.
1060 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1061 pDataArray.push_back(tempCluster);
1062 processIDS.push_back(i);
1064 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1065 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1066 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1071 listFiles = cluster(dividedNames[0], labels);
1073 //Wait until all threads have terminated.
1074 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1076 //Close all thread handles and free memory allocations.
1077 for(int i=0; i < pDataArray.size(); i++){
1079 tag = pDataArray[i]->tag;
1080 //get listfiles created
1081 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1083 set<string>::iterator it;
1084 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1086 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1087 CloseHandle(hThreadArray[i]);
1088 delete pDataArray[i];
1096 catch(exception& e) {
1097 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1101 //**********************************************************************************************************************
1103 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1106 vector<string> listFileNames;
1107 double smallestCutoff = cutoff;
1109 //cluster each distance file
1110 for (int i = 0; i < distNames.size(); i++) {
1112 string thisNamefile = distNames[i].begin()->second;
1113 string thisDistFile = distNames[i].begin()->first;
1115 string listFileName = "";
1116 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1117 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1119 if (m->control_pressed) { //clean up
1120 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1121 listFileNames.clear(); return listFileNames;
1124 listFileNames.push_back(listFileName);
1127 cutoff = smallestCutoff;
1129 return listFileNames;
1132 catch(exception& e) {
1133 m->errorOut(e, "ClusterSplitCommand", "cluster");
1139 //**********************************************************************************************************************
1140 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1142 string listFileName = "";
1144 ListVector* list = NULL;
1146 RAbundVector* rabund = NULL;
1150 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1152 //output your files too
1154 cout << endl << "Reading " << thisDistFile << endl;
1158 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1160 //reads phylip file storing data in 2D vector, also fills list and rabund
1162 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1164 NameAssignment* nameMap = NULL;
1165 CountTable* ct = NULL;
1167 nameMap = new NameAssignment(thisNamefile);
1169 cluster->readPhylipFile(thisDistFile, nameMap);
1170 }else if (countfile != "") {
1171 ct = new CountTable();
1172 ct->readTable(thisNamefile);
1173 cluster->readPhylipFile(thisDistFile, ct);
1175 tag = cluster->getTag();
1177 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1178 else { delete ct; } delete cluster; return 0; }
1180 list = cluster->getListVector();
1181 rabund = cluster->getRAbundVector();
1183 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1184 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1185 listFileName = fileroot+ tag + ".list";
1188 m->openOutputFile(fileroot+ tag + ".list", listFile);
1190 float previousDist = 0.00000;
1191 float rndPreviousDist = 0.00000;
1195 //output your files too
1197 cout << endl << "Clustering " << thisDistFile << endl;
1201 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1203 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1204 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1205 else { delete ct; } return listFileName; }
1207 cluster->update(cutoff);
1209 float dist = cluster->getSmallDist();
1212 rndDist = m->ceilDist(dist, precision);
1214 rndDist = m->roundDist(dist, precision);
1217 if(previousDist <= 0.0000 && dist != previousDist){
1218 oldList.setLabel("unique");
1219 oldList.print(listFile);
1220 if (labels.count("unique") == 0) { labels.insert("unique"); }
1222 else if(rndDist != rndPreviousDist){
1223 oldList.setLabel(toString(rndPreviousDist, length-1));
1224 oldList.print(listFile);
1225 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1229 previousDist = dist;
1230 rndPreviousDist = rndDist;
1234 if(previousDist <= 0.0000){
1235 oldList.setLabel("unique");
1236 oldList.print(listFile);
1237 if (labels.count("unique") == 0) { labels.insert("unique"); }
1239 else if(rndPreviousDist<cutoff){
1240 oldList.setLabel(toString(rndPreviousDist, length-1));
1241 oldList.print(listFile);
1242 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1248 delete cluster; delete list; delete rabund;
1249 if(namefile != ""){ delete nameMap; }
1252 m->mothurRemove(thisDistFile);
1253 m->mothurRemove(thisNamefile);
1255 return listFileName;
1258 catch(exception& e) {
1259 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1264 //**********************************************************************************************************************
1265 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1267 string listFileName = "";
1269 Cluster* cluster = NULL;
1270 SparseDistanceMatrix* matrix = NULL;
1271 ListVector* list = NULL;
1273 RAbundVector* rabund = NULL;
1275 if (m->control_pressed) { return listFileName; }
1279 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1281 //output your files too
1283 cout << endl << "Reading " << thisDistFile << endl;
1287 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1289 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1290 read->setCutoff(cutoff);
1292 NameAssignment* nameMap = NULL;
1293 CountTable* ct = NULL;
1295 nameMap = new NameAssignment(thisNamefile);
1297 read->read(nameMap);
1298 }else if (countfile != "") {
1299 ct = new CountTable();
1300 ct->readTable(thisNamefile);
1302 }else { read->read(nameMap); }
1304 list = read->getListVector();
1306 matrix = read->getDMatrix();
1308 if(countfile != "") {
1309 rabund = new RAbundVector();
1310 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1312 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1314 delete read; read = NULL;
1315 if (namefile != "") { delete nameMap; nameMap = NULL; }
1319 //output your files too
1321 cout << endl << "Clustering " << thisDistFile << endl;
1325 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1328 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1329 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1330 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1331 tag = cluster->getTag();
1333 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1334 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1337 m->openOutputFile(fileroot+ tag + ".list", listFile);
1339 listFileName = fileroot+ tag + ".list";
1341 float previousDist = 0.00000;
1342 float rndPreviousDist = 0.00000;
1348 double saveCutoff = cutoff;
1350 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1352 if (m->control_pressed) { //clean up
1353 delete matrix; delete list; delete cluster; delete rabund;
1355 m->mothurRemove(listFileName);
1356 return listFileName;
1359 cluster->update(saveCutoff);
1361 float dist = matrix->getSmallDist();
1364 rndDist = m->ceilDist(dist, precision);
1366 rndDist = m->roundDist(dist, precision);
1369 if(previousDist <= 0.0000 && dist != previousDist){
1370 oldList.setLabel("unique");
1371 oldList.print(listFile);
1372 if (labels.count("unique") == 0) { labels.insert("unique"); }
1374 else if(rndDist != rndPreviousDist){
1375 oldList.setLabel(toString(rndPreviousDist, length-1));
1376 oldList.print(listFile);
1377 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1380 previousDist = dist;
1381 rndPreviousDist = rndDist;
1386 if(previousDist <= 0.0000){
1387 oldList.setLabel("unique");
1388 oldList.print(listFile);
1389 if (labels.count("unique") == 0) { labels.insert("unique"); }
1391 else if(rndPreviousDist<cutoff){
1392 oldList.setLabel(toString(rndPreviousDist, length-1));
1393 oldList.print(listFile);
1394 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1397 delete matrix; delete list; delete cluster; delete rabund;
1398 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1401 if (m->control_pressed) { //clean up
1402 m->mothurRemove(listFileName);
1403 return listFileName;
1406 m->mothurRemove(thisDistFile);
1407 m->mothurRemove(thisNamefile);
1409 if (saveCutoff != cutoff) {
1410 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1411 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1413 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1416 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1418 return listFileName;
1421 catch(exception& e) {
1422 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1426 //**********************************************************************************************************************
1428 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1433 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1438 string thisOutputDir = outputDir;
1439 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1440 map<string, string> variables;
1441 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1442 string outputFileName = getOutputFileName("column", variables);
1443 m->mothurRemove(outputFileName);
1446 for (int i = 0; i < distNames.size(); i++) {
1447 if (m->control_pressed) { return 0; }
1449 string thisDistFile = distNames[i].begin()->first;
1451 m->appendFiles(thisDistFile, outputFileName);
1454 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1464 catch(exception& e) {
1465 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1469 //**********************************************************************************************************************
1470 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1472 rabund->setLabel(list->getLabel());
1473 for(int i = 0; i < list->getNumBins(); i++) {
1474 if (m->control_pressed) { break; }
1475 vector<string> binNames;
1476 string bin = list->get(i);
1477 m->splitAtComma(bin, binNames);
1479 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1480 rabund->push_back(total);
1484 catch(exception& e) {
1485 m->errorOut(e, "ClusterCommand", "createRabund");
1490 //**********************************************************************************************************************