2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
20 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
21 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
22 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
23 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
24 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
25 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
26 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
27 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
29 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
30 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
31 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
32 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
33 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
34 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
35 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37 vector<string> myArray;
38 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
42 m->errorOut(e, "ClusterSplitCommand", "setParameters");
46 //**********************************************************************************************************************
47 string ClusterSplitCommand::getHelpString(){
49 string helpString = "";
50 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
51 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
52 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
53 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
54 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
55 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
56 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
57 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
58 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
59 helpString += "The name parameter allows you to enter your name file. \n";
60 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
61 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
62 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
63 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
64 helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
65 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
66 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
67 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
68 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
69 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
71 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
73 helpString += "The cluster.split command should be in the following format: \n";
74 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
75 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
79 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
83 //**********************************************************************************************************************
84 string ClusterSplitCommand::getOutputPattern(string type) {
88 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
89 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
90 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
91 else if (type == "column") { pattern = "[filename],dist"; }
92 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
97 m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
101 //**********************************************************************************************************************
102 ClusterSplitCommand::ClusterSplitCommand(){
104 abort = true; calledHelp = true;
106 vector<string> tempOutNames;
107 outputTypes["list"] = tempOutNames;
108 outputTypes["rabund"] = tempOutNames;
109 outputTypes["sabund"] = tempOutNames;
110 outputTypes["column"] = tempOutNames;
112 catch(exception& e) {
113 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
117 //**********************************************************************************************************************
118 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
119 ClusterSplitCommand::ClusterSplitCommand(string option) {
121 abort = false; calledHelp = false;
124 //allow user to run help
125 if(option == "help") { help(); abort = true; calledHelp = true; }
126 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
129 vector<string> myArray = setParameters();
131 OptionParser parser(option);
132 map<string,string> parameters = parser.getParameters();
134 ValidParameters validParameter("cluster.split");
136 //check to make sure all parameters are valid for command
137 map<string,string>::iterator it;
138 for (it = parameters.begin(); it != parameters.end(); it++) {
139 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
144 //initialize outputTypes
145 vector<string> tempOutNames;
146 outputTypes["list"] = tempOutNames;
147 outputTypes["rabund"] = tempOutNames;
148 outputTypes["sabund"] = tempOutNames;
149 outputTypes["column"] = tempOutNames;
151 //if the user changes the output directory command factory will send this info to us in the output parameter
152 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
154 //if the user changes the input directory command factory will send this info to us in the output parameter
155 string inputDir = validParameter.validFile(parameters, "inputdir", false);
156 if (inputDir == "not found"){ inputDir = ""; }
159 it = parameters.find("phylip");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["phylip"] = inputDir + it->second; }
167 it = parameters.find("column");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["column"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
183 it = parameters.find("taxonomy");
184 //user has given a template file
185 if(it != parameters.end()){
186 path = m->hasPath(it->second);
187 //if the user has not given a path then, add inputdir. else leave path alone.
188 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
191 it = parameters.find("fasta");
192 //user has given a template file
193 if(it != parameters.end()){
194 path = m->hasPath(it->second);
195 //if the user has not given a path then, add inputdir. else leave path alone.
196 if (path == "") { parameters["fasta"] = inputDir + it->second; }
199 it = parameters.find("count");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["count"] = inputDir + it->second; }
208 //check for required parameters
209 phylipfile = validParameter.validFile(parameters, "phylip", true);
210 if (phylipfile == "not open") { abort = true; }
211 else if (phylipfile == "not found") { phylipfile = ""; }
212 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
214 columnfile = validParameter.validFile(parameters, "column", true);
215 if (columnfile == "not open") { abort = true; }
216 else if (columnfile == "not found") { columnfile = ""; }
217 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
219 namefile = validParameter.validFile(parameters, "name", true);
220 if (namefile == "not open") { abort = true; namefile = "";}
221 else if (namefile == "not found") { namefile = ""; }
222 else { m->setNameFile(namefile); }
224 countfile = validParameter.validFile(parameters, "count", true);
225 if (countfile == "not open") { abort = true; countfile = "";}
226 else if (countfile == "not found") { countfile = ""; }
227 else { m->setCountTableFile(countfile); }
229 fastafile = validParameter.validFile(parameters, "fasta", true);
230 if (fastafile == "not open") { abort = true; }
231 else if (fastafile == "not found") { fastafile = ""; }
232 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
234 taxFile = validParameter.validFile(parameters, "taxonomy", true);
235 if (taxFile == "not open") { taxFile = ""; abort = true; }
236 else if (taxFile == "not found") { taxFile = ""; }
237 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
239 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
240 //is there are current file available for either of these?
241 //give priority to column, then phylip, then fasta
242 columnfile = m->getColumnFile();
243 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
245 phylipfile = m->getPhylipFile();
246 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
248 fastafile = m->getFastaFile();
249 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
257 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
259 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
261 if (columnfile != "") {
262 if ((namefile == "") && (countfile == "")) {
263 namefile = m->getNameFile();
264 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
266 countfile = m->getCountTableFile();
267 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
269 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
276 if (fastafile != "") {
278 taxFile = m->getTaxonomyFile();
279 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
281 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
286 if ((namefile == "") && (countfile == "")) {
287 namefile = m->getNameFile();
288 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
290 countfile = m->getCountTableFile();
291 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
293 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
300 //check for optional parameter and set defaults
301 // ...at some point should added some additional type checking...
302 //get user cutoff and precision or use defaults
304 temp = validParameter.validFile(parameters, "precision", false);
305 if (temp == "not found") { temp = "100"; }
306 //saves precision legnth for formatting below
307 length = temp.length();
308 m->mothurConvert(temp, precision);
310 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
311 hard = m->isTrue(temp);
313 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
314 large = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
317 m->setProcessors(temp);
318 m->mothurConvert(temp, processors);
320 temp = validParameter.validFile(parameters, "splitmethod", false);
321 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
322 if (temp == "not found") { splitmethod = "distance"; }
323 else { splitmethod = temp; }
326 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
327 classic = m->isTrue(temp);
329 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
331 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
332 m->mothurConvert(temp, cutoff);
333 cutoff += (5 / (precision * 10.0));
335 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
336 m->mothurConvert(temp, taxLevelCutoff);
338 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
340 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
341 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
343 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
344 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
346 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
348 showabund = validParameter.validFile(parameters, "showabund", false);
349 if (showabund == "not found") { showabund = "T"; }
351 temp = validParameter.validFile(parameters, "cluster", false); if (temp == "not found") { temp = "T"; }
352 runCluster = m->isTrue(temp);
354 timing = validParameter.validFile(parameters, "timing", false);
355 if (timing == "not found") { timing = "F"; }
359 catch(exception& e) {
360 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
365 //**********************************************************************************************************************
367 int ClusterSplitCommand::execute(){
370 if (abort == true) { if (calledHelp) { return 0; } return 2; }
373 vector<string> listFileNames;
375 string singletonName = "";
376 double saveCutoff = cutoff;
378 //****************** file prep work ******************************//
383 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
384 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
386 if (pid == 0) { //only process 0 converts and splits
390 //if user gave a phylip file convert to column file
391 if (format == "phylip") {
393 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
395 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
397 NameAssignment* nameMap = NULL;
398 convert->setFormat("phylip");
399 convert->read(nameMap);
401 if (m->control_pressed) { delete convert; return 0; }
403 distfile = convert->getOutputFile();
405 //if no names file given with phylip file, create it
406 ListVector* listToMakeNameFile = convert->getListVector();
407 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
409 namefile = phylipfile + ".names";
410 m->openOutputFile(namefile, out);
411 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
412 string bin = listToMakeNameFile->get(i);
413 out << bin << '\t' << bin << endl;
417 delete listToMakeNameFile;
420 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
422 if (m->control_pressed) { return 0; }
425 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
427 //split matrix into non-overlapping groups
429 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
430 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
431 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
432 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
436 if (m->control_pressed) { delete split; return 0; }
438 singletonName = split->getSingletonNames();
439 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
442 if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
444 //output a merged distance file
445 //if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
447 if (m->control_pressed) { return 0; }
449 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
454 m->mothurOutEndLine();
455 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
456 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
457 m->mothurOutEndLine();
462 //****************** break up files between processes and cluster each file set ******************************//
464 ////you are process 0 from above////
466 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
467 dividedNames.resize(processors);
469 //for each file group figure out which process will complete it
470 //want to divide the load intelligently so the big files are spread between processes
471 for (int i = 0; i < distName.size(); i++) {
472 int processToAssign = (i+1) % processors;
473 if (processToAssign == 0) { processToAssign = processors; }
475 dividedNames[(processToAssign-1)].push_back(distName[i]);
478 //not lets reverse the order of ever other process, so we balance big files running with little ones
479 for (int i = 0; i < processors; i++) {
480 int remainder = ((i+1) % processors);
481 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
485 //send each child the list of files it needs to process
486 for(int i = 1; i < processors; i++) {
487 //send number of file pairs
488 int num = dividedNames[i].size();
489 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
491 for (int j = 0; j < num; j++) { //send filenames to process i
492 char tempDistFileName[1024];
493 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
494 int lengthDist = (dividedNames[i][j].begin()->first).length();
496 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
497 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
499 char tempNameFileName[1024];
500 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
501 int lengthName = (dividedNames[i][j].begin()->second).length();
503 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
504 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
509 listFileNames = cluster(dividedNames[0], labels);
511 //receive the other processes info
512 for(int i = 1; i < processors; i++) {
513 int num = dividedNames[i].size();
516 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
517 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
519 //send list filenames to root process
520 for (int j = 0; j < num; j++) {
522 char tempListFileName[1024];
524 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
525 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
527 string myListFileName = tempListFileName;
528 myListFileName = myListFileName.substr(0, lengthList);
530 listFileNames.push_back(myListFileName);
533 //send Labels to root process
535 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
537 for (int j = 0; j < numLabels; j++) {
541 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
542 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
544 string myLabel = tempLabel;
545 myLabel = myLabel.substr(0, lengthLabel);
547 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
551 }else { //you are a child process
552 vector < map<string, string> > myNames;
554 //recieve the files you need to process
555 //receive number of file pairs
557 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
561 for (int j = 0; j < num; j++) { //receive filenames to process
563 char tempDistFileName[1024];
565 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
566 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
568 string myDistFileName = tempDistFileName;
569 myDistFileName = myDistFileName.substr(0, lengthDist);
572 char tempNameFileName[1024];
574 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
575 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
577 string myNameFileName = tempNameFileName;
578 myNameFileName = myNameFileName.substr(0, lengthName);
581 myNames[j][myDistFileName] = myNameFileName;
585 listFileNames = cluster(myNames, labels);
588 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
590 //send list filenames to root process
591 for (int j = 0; j < num; j++) {
592 char tempListFileName[1024];
593 strcpy(tempListFileName, listFileNames[j].c_str());
594 int lengthList = listFileNames[j].length();
596 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
597 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
600 //send Labels to root process
601 int numLabels = labels.size();
602 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
604 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
606 strcpy(tempLabel, (*it).c_str());
607 int lengthLabel = (*it).length();
609 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
610 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
615 MPI_Barrier(MPI_COMM_WORLD);
618 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
620 if (processors > distName.size()) { processors = distName.size(); }
622 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
624 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
626 listFileNames = createProcesses(distName, labels);
629 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
632 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
634 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
636 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
638 //****************** merge list file and create rabund and sabund files ******************************//
640 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
643 if (pid == 0) { //only process 0 merges
646 ListVector* listSingle;
647 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
649 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
651 mergeLists(listFileNames, labelBins, listSingle);
653 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
655 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
657 //set list file as new current listfile
659 itTypes = outputTypes.find("list");
660 if (itTypes != outputTypes.end()) {
661 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
664 //set rabund file as new current rabundfile
665 itTypes = outputTypes.find("rabund");
666 if (itTypes != outputTypes.end()) {
667 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
670 //set sabund file as new current sabundfile
671 itTypes = outputTypes.find("sabund");
672 if (itTypes != outputTypes.end()) {
673 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
676 m->mothurOutEndLine();
677 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
678 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
679 m->mothurOutEndLine();
682 } //only process 0 merges
685 MPI_Barrier(MPI_COMM_WORLD);
690 catch(exception& e) {
691 m->errorOut(e, "ClusterSplitCommand", "execute");
695 //**********************************************************************************************************************
696 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
699 map<float, int> labelBin;
700 vector<float> orderFloat;
704 if (singleton != "none") {
707 m->openInputFile(singleton, in);
709 string firstCol, secondCol;
710 listSingle = new ListVector();
712 if (countfile != "") { m->getline(in); m->gobble(in); }
715 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
716 if (countfile == "") { listSingle->push_back(secondCol); }
717 else { listSingle->push_back(firstCol); }
721 m->mothurRemove(singleton);
723 numSingleBins = listSingle->getNumBins();
724 }else{ listSingle = NULL; numSingleBins = 0; }
726 //go through users set and make them floats so we can sort them
727 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
730 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
731 else if (*it == "unique") { temp = -1.0; }
733 if (temp <= cutoff) {
734 orderFloat.push_back(temp);
735 labelBin[temp] = numSingleBins; //initialize numbins
740 sort(orderFloat.begin(), orderFloat.end());
743 //get the list info from each file
744 for (int k = 0; k < listNames.size(); k++) {
746 if (m->control_pressed) {
747 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
748 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
752 InputData* input = new InputData(listNames[k], "list");
753 ListVector* list = input->getListVector();
754 string lastLabel = list->getLabel();
756 string filledInList = listNames[k] + "filledInTemp";
758 m->openOutputFile(filledInList, outFilled);
760 //for each label needed
761 for(int l = 0; l < orderFloat.size(); l++){
764 if (orderFloat[l] == -1) { thisLabel = "unique"; }
765 else { thisLabel = toString(orderFloat[l], length-1); }
767 //this file has reached the end
769 list = input->getListVector(lastLabel, true);
770 }else{ //do you have the distance, or do you need to fill in
773 if (list->getLabel() == "unique") { labelFloat = -1.0; }
774 else { convert(list->getLabel(), labelFloat); }
776 //check for missing labels
777 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
778 //if its bigger get last label, otherwise keep it
780 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
782 lastLabel = list->getLabel();
786 list->setLabel(thisLabel);
787 list->print(outFilled);
790 labelBin[orderFloat[l]] += list->getNumBins();
794 list = input->getListVector();
797 if (list != NULL) { delete list; }
801 m->mothurRemove(listNames[k]);
802 rename(filledInList.c_str(), listNames[k].c_str());
807 catch(exception& e) {
808 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
812 //**********************************************************************************************************************
813 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
815 if (outputDir == "") { outputDir += m->hasPath(distfile); }
816 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
818 map<string, string> variables;
819 variables["[filename]"] = fileroot;
820 variables["[clustertag]"] = tag;
821 string sabundFileName = getOutputFileName("sabund", variables);
822 string rabundFileName = getOutputFileName("rabund", variables);
823 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
824 string listFileName = getOutputFileName("list", variables);
826 if (countfile == "") {
827 m->openOutputFile(sabundFileName, outSabund);
828 m->openOutputFile(rabundFileName, outRabund);
829 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
830 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
833 m->openOutputFile(listFileName, outList);
834 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
836 map<float, int>::iterator itLabel;
838 //clears out junk for autocompleting of list files above. Perhaps there is a beter way to handle this from within the data structure?
839 m->printedListHeaders = false;
841 //for each label needed
842 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
845 if (itLabel->first == -1) { thisLabel = "unique"; }
846 else { thisLabel = toString(itLabel->first, length-1); }
848 //outList << thisLabel << '\t' << itLabel->second << '\t';
850 RAbundVector* rabund = NULL;
851 ListVector completeList;
852 completeList.setLabel(thisLabel);
854 if (countfile == "") {
855 rabund = new RAbundVector();
856 rabund->setLabel(thisLabel);
860 if (listSingle != NULL) {
861 for (int j = 0; j < listSingle->getNumBins(); j++) {
862 //outList << listSingle->get(j) << '\t';
863 completeList.push_back(listSingle->get(j));
864 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
868 //get the list info from each file
869 for (int k = 0; k < listNames.size(); k++) {
871 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
873 InputData* input = new InputData(listNames[k], "list");
874 ListVector* list = input->getListVector(thisLabel);
876 //this file has reached the end
877 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
879 for (int j = 0; j < list->getNumBins(); j++) {
880 //outList << list->get(j) << '\t';
881 completeList.push_back(list->get(j));
882 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
889 if (countfile == "") {
890 SAbundVector sabund = rabund->getSAbundVector();
891 sabund.print(outSabund);
892 rabund->print(outRabund);
895 if (!m->printedListHeaders) {
896 m->listBinLabelsInFile.clear(); completeList.printHeaders(outList); }
897 completeList.print(outList);
899 if (rabund != NULL) { delete rabund; }
903 if (countfile == "") {
907 if (listSingle != NULL) { delete listSingle; }
909 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
913 catch(exception& e) {
914 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
919 //**********************************************************************************************************************
921 void ClusterSplitCommand::printData(ListVector* oldList){
923 string label = oldList->getLabel();
924 RAbundVector oldRAbund = oldList->getRAbundVector();
926 oldRAbund.setLabel(label);
927 if (m->isTrue(showabund)) {
928 oldRAbund.getSAbundVector().print(cout);
930 oldRAbund.print(outRabund);
931 oldRAbund.getSAbundVector().print(outSabund);
933 oldList->print(outList);
935 catch(exception& e) {
936 m->errorOut(e, "ClusterSplitCommand", "printData");
940 //**********************************************************************************************************************
941 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
944 vector<string> listFiles;
945 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
946 dividedNames.resize(processors);
948 //for each file group figure out which process will complete it
949 //want to divide the load intelligently so the big files are spread between processes
950 for (int i = 0; i < distName.size(); i++) {
952 int processToAssign = (i+1) % processors;
953 if (processToAssign == 0) { processToAssign = processors; }
955 dividedNames[(processToAssign-1)].push_back(distName[i]);
956 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
959 //now lets reverse the order of ever other process, so we balance big files running with little ones
960 for (int i = 0; i < processors; i++) {
962 int remainder = ((i+1) % processors);
963 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
966 if (m->control_pressed) { return listFiles; }
968 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
972 //loop through and create all the processes you want
973 while (process != processors) {
977 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
981 vector<string> listFileNames = cluster(dividedNames[process], labels);
983 //write out names to file
984 string filename = toString(getpid()) + ".temp";
986 m->openOutputFile(filename, out);
988 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
993 filename = toString(getpid()) + ".temp.labels";
994 m->openOutputFile(filename, outLabels);
996 outLabels << cutoff << endl;
997 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
998 outLabels << (*it) << endl;
1004 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1005 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1011 listFiles = cluster(dividedNames[0], labels);
1013 //force parent to wait until all the processes are done
1014 for (int i=0;i< processIDS.size();i++) {
1015 int temp = processIDS[i];
1019 //get list of list file names from each process
1020 for(int i=0;i<processIDS.size();i++){
1021 string filename = toString(processIDS[i]) + ".temp";
1023 m->openInputFile(filename, in);
1025 in >> tag; m->gobble(in);
1029 in >> tempName; m->gobble(in);
1030 listFiles.push_back(tempName);
1033 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1036 filename = toString(processIDS[i]) + ".temp.labels";
1038 m->openInputFile(filename, in2);
1041 in2 >> tempCutoff; m->gobble(in2);
1042 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1046 in2 >> tempName; m->gobble(in2);
1047 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1050 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1056 //////////////////////////////////////////////////////////////////////////////////////////////////////
1057 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1058 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1059 //Taking advantage of shared memory to allow both threads to add labels.
1060 //////////////////////////////////////////////////////////////////////////////////////////////////////
1062 vector<clusterData*> pDataArray;
1063 DWORD dwThreadIdArray[processors-1];
1064 HANDLE hThreadArray[processors-1];
1066 //Create processor worker threads.
1067 for( int i=1; i<processors; i++ ){
1068 // Allocate memory for thread data.
1069 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1070 pDataArray.push_back(tempCluster);
1071 processIDS.push_back(i);
1073 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1074 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1075 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1080 listFiles = cluster(dividedNames[0], labels);
1082 //Wait until all threads have terminated.
1083 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1085 //Close all thread handles and free memory allocations.
1086 for(int i=0; i < pDataArray.size(); i++){
1088 tag = pDataArray[i]->tag;
1089 //get listfiles created
1090 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1092 set<string>::iterator it;
1093 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1095 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1096 CloseHandle(hThreadArray[i]);
1097 delete pDataArray[i];
1105 catch(exception& e) {
1106 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1110 //**********************************************************************************************************************
1112 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1115 vector<string> listFileNames;
1116 double smallestCutoff = cutoff;
1118 //cluster each distance file
1119 for (int i = 0; i < distNames.size(); i++) {
1121 string thisNamefile = distNames[i].begin()->second;
1122 string thisDistFile = distNames[i].begin()->first;
1124 string listFileName = "";
1125 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1126 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1128 if (m->control_pressed) { //clean up
1129 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1130 listFileNames.clear(); return listFileNames;
1133 listFileNames.push_back(listFileName);
1136 cutoff = smallestCutoff;
1138 return listFileNames;
1141 catch(exception& e) {
1142 m->errorOut(e, "ClusterSplitCommand", "cluster");
1148 //**********************************************************************************************************************
1149 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1151 string listFileName = "";
1153 ListVector* list = NULL;
1155 RAbundVector* rabund = NULL;
1159 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1161 //output your files too
1163 cout << endl << "Reading " << thisDistFile << endl;
1167 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1169 //reads phylip file storing data in 2D vector, also fills list and rabund
1171 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1173 NameAssignment* nameMap = NULL;
1174 CountTable* ct = NULL;
1176 nameMap = new NameAssignment(thisNamefile);
1178 cluster->readPhylipFile(thisDistFile, nameMap);
1179 }else if (countfile != "") {
1180 ct = new CountTable();
1181 ct->readTable(thisNamefile, false, false);
1182 cluster->readPhylipFile(thisDistFile, ct);
1184 tag = cluster->getTag();
1186 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1187 else { delete ct; } delete cluster; return 0; }
1189 list = cluster->getListVector();
1190 rabund = cluster->getRAbundVector();
1192 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1193 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1194 listFileName = fileroot+ tag + ".list";
1197 m->openOutputFile(fileroot+ tag + ".list", listFile);
1199 float previousDist = 0.00000;
1200 float rndPreviousDist = 0.00000;
1204 //output your files too
1206 cout << endl << "Clustering " << thisDistFile << endl;
1210 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1212 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1213 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1214 else { delete ct; } return listFileName; }
1216 cluster->update(cutoff);
1218 float dist = cluster->getSmallDist();
1221 rndDist = m->ceilDist(dist, precision);
1223 rndDist = m->roundDist(dist, precision);
1226 if(previousDist <= 0.0000 && dist != previousDist){
1227 oldList.setLabel("unique");
1228 oldList.print(listFile);
1229 if (labels.count("unique") == 0) { labels.insert("unique"); }
1231 else if(rndDist != rndPreviousDist){
1232 oldList.setLabel(toString(rndPreviousDist, length-1));
1233 oldList.print(listFile);
1234 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1238 previousDist = dist;
1239 rndPreviousDist = rndDist;
1243 if(previousDist <= 0.0000){
1244 oldList.setLabel("unique");
1245 oldList.print(listFile);
1246 if (labels.count("unique") == 0) { labels.insert("unique"); }
1248 else if(rndPreviousDist<cutoff){
1249 oldList.setLabel(toString(rndPreviousDist, length-1));
1250 oldList.print(listFile);
1251 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1257 delete cluster; delete list; delete rabund;
1258 if(namefile != ""){ delete nameMap; }
1261 m->mothurRemove(thisDistFile);
1262 m->mothurRemove(thisNamefile);
1264 return listFileName;
1267 catch(exception& e) {
1268 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1273 //**********************************************************************************************************************
1274 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1276 string listFileName = "";
1278 Cluster* cluster = NULL;
1279 SparseDistanceMatrix* matrix = NULL;
1280 ListVector* list = NULL;
1282 RAbundVector* rabund = NULL;
1284 if (m->control_pressed) { return listFileName; }
1288 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1290 //output your files too
1292 cout << endl << "Reading " << thisDistFile << endl;
1296 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1298 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1299 read->setCutoff(cutoff);
1301 NameAssignment* nameMap = NULL;
1302 CountTable* ct = NULL;
1304 nameMap = new NameAssignment(thisNamefile);
1306 read->read(nameMap);
1307 }else if (countfile != "") {
1308 ct = new CountTable();
1309 ct->readTable(thisNamefile, false, false);
1311 }else { read->read(nameMap); }
1313 list = read->getListVector();
1315 matrix = read->getDMatrix();
1317 if(countfile != "") {
1318 rabund = new RAbundVector();
1319 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1321 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1323 delete read; read = NULL;
1324 if (namefile != "") { delete nameMap; nameMap = NULL; }
1328 //output your files too
1330 cout << endl << "Clustering " << thisDistFile << endl;
1334 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1337 float adjust = -1.0;
1338 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
1339 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
1340 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); }
1341 tag = cluster->getTag();
1343 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1344 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1347 m->openOutputFile(fileroot+ tag + ".list", listFile);
1349 listFileName = fileroot+ tag + ".list";
1351 float previousDist = 0.00000;
1352 float rndPreviousDist = 0.00000;
1358 double saveCutoff = cutoff;
1360 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1362 if (m->control_pressed) { //clean up
1363 delete matrix; delete list; delete cluster; delete rabund;
1365 m->mothurRemove(listFileName);
1366 return listFileName;
1369 cluster->update(saveCutoff);
1371 float dist = matrix->getSmallDist();
1374 rndDist = m->ceilDist(dist, precision);
1376 rndDist = m->roundDist(dist, precision);
1379 if(previousDist <= 0.0000 && dist != previousDist){
1380 oldList.setLabel("unique");
1381 oldList.print(listFile);
1382 if (labels.count("unique") == 0) { labels.insert("unique"); }
1384 else if(rndDist != rndPreviousDist){
1385 oldList.setLabel(toString(rndPreviousDist, length-1));
1386 oldList.print(listFile);
1387 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1390 previousDist = dist;
1391 rndPreviousDist = rndDist;
1396 if(previousDist <= 0.0000){
1397 oldList.setLabel("unique");
1398 oldList.print(listFile);
1399 if (labels.count("unique") == 0) { labels.insert("unique"); }
1401 else if(rndPreviousDist<cutoff){
1402 oldList.setLabel(toString(rndPreviousDist, length-1));
1403 oldList.print(listFile);
1404 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1407 delete matrix; delete list; delete cluster; delete rabund;
1408 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1411 if (m->control_pressed) { //clean up
1412 m->mothurRemove(listFileName);
1413 return listFileName;
1416 m->mothurRemove(thisDistFile);
1417 m->mothurRemove(thisNamefile);
1419 if (saveCutoff != cutoff) {
1420 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1421 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1423 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1426 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1428 return listFileName;
1431 catch(exception& e) {
1432 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1436 //**********************************************************************************************************************
1438 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1443 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1448 string thisOutputDir = outputDir;
1449 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1450 map<string, string> variables;
1451 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1452 string outputFileName = getOutputFileName("column", variables);
1453 m->mothurRemove(outputFileName);
1456 for (int i = 0; i < distNames.size(); i++) {
1457 if (m->control_pressed) { return 0; }
1459 string thisDistFile = distNames[i].begin()->first;
1461 m->appendFiles(thisDistFile, outputFileName);
1464 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1474 catch(exception& e) {
1475 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1479 //**********************************************************************************************************************
1480 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1482 rabund->setLabel(list->getLabel());
1483 for(int i = 0; i < list->getNumBins(); i++) {
1484 if (m->control_pressed) { break; }
1485 vector<string> binNames;
1486 string bin = list->get(i);
1487 m->splitAtComma(bin, binNames);
1489 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1490 rabund->push_back(total);
1494 catch(exception& e) {
1495 m->errorOut(e, "ClusterCommand", "createRabund");
1500 //**********************************************************************************************************************