2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter pfile("file", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","",false,false,true); parameters.push_back(pfile);
17 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
18 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
19 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
20 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
21 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
22 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
23 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
24 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
25 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
26 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
27 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
28 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
29 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
30 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
31 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
32 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
33 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
34 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
35 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
36 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
38 vector<string> myArray;
39 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
43 m->errorOut(e, "ClusterSplitCommand", "setParameters");
47 //**********************************************************************************************************************
48 string ClusterSplitCommand::getHelpString(){
50 string helpString = "";
51 helpString += "The cluster.split command parameter options are file, fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, cluster, processors. Fasta or Phylip or column and name are required.\n";
52 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
53 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
54 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
55 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
56 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
57 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
58 helpString += "The file option allows you to enter your file containing your list of column and names/count files as well as the singleton file. This file is mothur generated, when you run cluster.split() with the cluster=f parameter. This can be helpful when you have a large dataset that you may be able to use all your processors for the splitting step, but have to reduce them for the cluster step due to RAM constraints. For example: cluster.split(fasta=yourFasta, taxonomy=yourTax, count=yourCount, taxlevel=3, cluster=f, processors=8) then cluster.split(file=yourFile, processors=4). This allows your to maximize your processors during the splitting step. Also, if you are unsure if the cluster step will have RAM issue with multiple processors, you can avoid running the first part of the command multiple times.\n";
59 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
60 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
61 helpString += "The name parameter allows you to enter your name file. \n";
62 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format";
63 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
64 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
65 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
66 helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
67 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
68 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
69 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
70 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
71 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
73 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
75 helpString += "The cluster.split command should be in the following format: \n";
76 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
77 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
81 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
85 //**********************************************************************************************************************
86 string ClusterSplitCommand::getOutputPattern(string type) {
90 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
91 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
92 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
93 else if (type == "column") { pattern = "[filename],dist"; }
94 else if (type == "file") { pattern = "[filename],file"; }
95 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
100 m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
104 //**********************************************************************************************************************
105 ClusterSplitCommand::ClusterSplitCommand(){
107 abort = true; calledHelp = true;
109 vector<string> tempOutNames;
110 outputTypes["list"] = tempOutNames;
111 outputTypes["rabund"] = tempOutNames;
112 outputTypes["sabund"] = tempOutNames;
113 outputTypes["column"] = tempOutNames;
114 outputTypes["file"] = tempOutNames;
116 catch(exception& e) {
117 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
121 //**********************************************************************************************************************
122 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
123 ClusterSplitCommand::ClusterSplitCommand(string option) {
125 abort = false; calledHelp = false;
128 //allow user to run help
129 if(option == "help") { help(); abort = true; calledHelp = true; }
130 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
133 vector<string> myArray = setParameters();
135 OptionParser parser(option);
136 map<string,string> parameters = parser.getParameters();
138 ValidParameters validParameter("cluster.split");
140 //check to make sure all parameters are valid for command
141 map<string,string>::iterator it;
142 for (it = parameters.begin(); it != parameters.end(); it++) {
143 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
148 //initialize outputTypes
149 vector<string> tempOutNames;
150 outputTypes["list"] = tempOutNames;
151 outputTypes["rabund"] = tempOutNames;
152 outputTypes["sabund"] = tempOutNames;
153 outputTypes["column"] = tempOutNames;
154 outputTypes["file"] = tempOutNames;
156 //if the user changes the output directory command factory will send this info to us in the output parameter
157 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
159 //if the user changes the input directory command factory will send this info to us in the output parameter
160 string inputDir = validParameter.validFile(parameters, "inputdir", false);
161 if (inputDir == "not found"){ inputDir = ""; }
164 it = parameters.find("phylip");
165 //user has given a template file
166 if(it != parameters.end()){
167 path = m->hasPath(it->second);
168 //if the user has not given a path then, add inputdir. else leave path alone.
169 if (path == "") { parameters["phylip"] = inputDir + it->second; }
172 it = parameters.find("column");
173 //user has given a template file
174 if(it != parameters.end()){
175 path = m->hasPath(it->second);
176 //if the user has not given a path then, add inputdir. else leave path alone.
177 if (path == "") { parameters["column"] = inputDir + it->second; }
180 it = parameters.find("name");
181 //user has given a template file
182 if(it != parameters.end()){
183 path = m->hasPath(it->second);
184 //if the user has not given a path then, add inputdir. else leave path alone.
185 if (path == "") { parameters["name"] = inputDir + it->second; }
188 it = parameters.find("taxonomy");
189 //user has given a template file
190 if(it != parameters.end()){
191 path = m->hasPath(it->second);
192 //if the user has not given a path then, add inputdir. else leave path alone.
193 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
196 it = parameters.find("fasta");
197 //user has given a template file
198 if(it != parameters.end()){
199 path = m->hasPath(it->second);
200 //if the user has not given a path then, add inputdir. else leave path alone.
201 if (path == "") { parameters["fasta"] = inputDir + it->second; }
204 it = parameters.find("count");
205 //user has given a template file
206 if(it != parameters.end()){
207 path = m->hasPath(it->second);
208 //if the user has not given a path then, add inputdir. else leave path alone.
209 if (path == "") { parameters["count"] = inputDir + it->second; }
212 it = parameters.find("file");
213 //user has given a template file
214 if(it != parameters.end()){
215 path = m->hasPath(it->second);
216 //if the user has not given a path then, add inputdir. else leave path alone.
217 if (path == "") { parameters["file"] = inputDir + it->second; }
221 //check for required parameters
222 file = validParameter.validFile(parameters, "file", true);
223 if (file == "not open") { file = ""; abort = true; }
224 else if (file == "not found") { file = ""; }
225 else { distfile = file; }
227 phylipfile = validParameter.validFile(parameters, "phylip", true);
228 if (phylipfile == "not open") { abort = true; }
229 else if (phylipfile == "not found") { phylipfile = ""; }
230 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
232 columnfile = validParameter.validFile(parameters, "column", true);
233 if (columnfile == "not open") { abort = true; }
234 else if (columnfile == "not found") { columnfile = ""; }
235 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
237 namefile = validParameter.validFile(parameters, "name", true);
238 if (namefile == "not open") { abort = true; namefile = "";}
239 else if (namefile == "not found") { namefile = ""; }
240 else { m->setNameFile(namefile); }
242 countfile = validParameter.validFile(parameters, "count", true);
243 if (countfile == "not open") { abort = true; countfile = "";}
244 else if (countfile == "not found") { countfile = ""; }
245 else { m->setCountTableFile(countfile); }
247 fastafile = validParameter.validFile(parameters, "fasta", true);
248 if (fastafile == "not open") { abort = true; }
249 else if (fastafile == "not found") { fastafile = ""; }
250 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
252 taxFile = validParameter.validFile(parameters, "taxonomy", true);
253 if (taxFile == "not open") { taxFile = ""; abort = true; }
254 else if (taxFile == "not found") { taxFile = ""; }
255 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
257 if ((phylipfile == "") && (columnfile == "") && (fastafile == "") && (file == "")) {
258 //is there are current file available for either of these?
259 //give priority to column, then phylip, then fasta
260 columnfile = m->getColumnFile();
261 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
263 phylipfile = m->getPhylipFile();
264 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
266 fastafile = m->getFastaFile();
267 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
269 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a file, phylip or a column or fastafile."); m->mothurOutEndLine();
275 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "") && (file != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: file, fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
277 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
279 if (columnfile != "") {
280 if ((namefile == "") && (countfile == "")) {
281 namefile = m->getNameFile();
282 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
284 countfile = m->getCountTableFile();
285 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
287 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
294 if (fastafile != "") {
296 taxFile = m->getTaxonomyFile();
297 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
299 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
304 if ((namefile == "") && (countfile == "")) {
305 namefile = m->getNameFile();
306 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
308 countfile = m->getCountTableFile();
309 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
311 m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine();
318 //check for optional parameter and set defaults
319 // ...at some point should added some additional type checking...
320 //get user cutoff and precision or use defaults
322 temp = validParameter.validFile(parameters, "precision", false);
323 if (temp == "not found") { temp = "100"; }
324 //saves precision legnth for formatting below
325 length = temp.length();
326 m->mothurConvert(temp, precision);
328 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
329 hard = m->isTrue(temp);
331 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
332 large = m->isTrue(temp);
334 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
335 m->setProcessors(temp);
336 m->mothurConvert(temp, processors);
338 temp = validParameter.validFile(parameters, "splitmethod", false);
339 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
340 if (temp == "not found") { splitmethod = "distance"; }
341 else { splitmethod = temp; }
344 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
345 classic = m->isTrue(temp);
347 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
349 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
350 m->mothurConvert(temp, cutoff);
351 cutoff += (5 / (precision * 10.0));
353 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
354 m->mothurConvert(temp, taxLevelCutoff);
356 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
358 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
359 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
361 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
362 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
364 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
366 showabund = validParameter.validFile(parameters, "showabund", false);
367 if (showabund == "not found") { showabund = "T"; }
369 temp = validParameter.validFile(parameters, "cluster", false); if (temp == "not found") { temp = "T"; }
370 runCluster = m->isTrue(temp);
372 timing = validParameter.validFile(parameters, "timing", false);
373 if (timing == "not found") { timing = "F"; }
377 catch(exception& e) {
378 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
383 //**********************************************************************************************************************
385 int ClusterSplitCommand::execute(){
388 if (abort == true) { if (calledHelp) { return 0; } return 2; }
391 vector<string> listFileNames;
392 vector< map<string, string> > distName;
394 string singletonName = "";
396 double saveCutoff = cutoff;
398 if (file != "") { deleteFiles = false; estart = time(NULL); singletonName = readFile(distName); }
401 //****************** file prep work ******************************//
406 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
407 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
409 if (pid == 0) { //only process 0 converts and splits
413 //if user gave a phylip file convert to column file
414 if (format == "phylip") {
416 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
418 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
420 NameAssignment* nameMap = NULL;
421 convert->setFormat("phylip");
422 convert->read(nameMap);
424 if (m->control_pressed) { delete convert; return 0; }
426 distfile = convert->getOutputFile();
428 //if no names file given with phylip file, create it
429 ListVector* listToMakeNameFile = convert->getListVector();
430 if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix
432 namefile = phylipfile + ".names";
433 m->openOutputFile(namefile, out);
434 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
435 string bin = listToMakeNameFile->get(i);
436 out << bin << '\t' << bin << endl;
440 delete listToMakeNameFile;
443 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
445 if (m->control_pressed) { return 0; }
448 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
450 //split matrix into non-overlapping groups
452 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); }
453 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); }
454 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
455 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
459 if (m->control_pressed) { delete split; return 0; }
461 singletonName = split->getSingletonNames();
462 distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
465 if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
467 //output a merged distance file
468 //if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
470 if (m->control_pressed) { return 0; }
472 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
476 printFile(singletonName, distName);
478 m->mothurOutEndLine();
479 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
480 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
481 m->mothurOutEndLine();
487 //****************** break up files between processes and cluster each file set ******************************//
489 ////you are process 0 from above////
491 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
492 dividedNames.resize(processors);
494 //for each file group figure out which process will complete it
495 //want to divide the load intelligently so the big files are spread between processes
496 for (int i = 0; i < distName.size(); i++) {
497 int processToAssign = (i+1) % processors;
498 if (processToAssign == 0) { processToAssign = processors; }
500 dividedNames[(processToAssign-1)].push_back(distName[i]);
503 //not lets reverse the order of ever other process, so we balance big files running with little ones
504 for (int i = 0; i < processors; i++) {
505 int remainder = ((i+1) % processors);
506 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
510 //send each child the list of files it needs to process
511 for(int i = 1; i < processors; i++) {
512 //send number of file pairs
513 int num = dividedNames[i].size();
514 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
516 for (int j = 0; j < num; j++) { //send filenames to process i
517 char tempDistFileName[1024];
518 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
519 int lengthDist = (dividedNames[i][j].begin()->first).length();
521 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
522 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
524 char tempNameFileName[1024];
525 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
526 int lengthName = (dividedNames[i][j].begin()->second).length();
528 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
529 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
534 listFileNames = cluster(dividedNames[0], labels);
536 //receive the other processes info
537 for(int i = 1; i < processors; i++) {
538 int num = dividedNames[i].size();
541 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
542 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
544 //send list filenames to root process
545 for (int j = 0; j < num; j++) {
547 char tempListFileName[1024];
549 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
550 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
552 string myListFileName = tempListFileName;
553 myListFileName = myListFileName.substr(0, lengthList);
555 listFileNames.push_back(myListFileName);
558 //send Labels to root process
560 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
562 for (int j = 0; j < numLabels; j++) {
566 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
567 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
569 string myLabel = tempLabel;
570 myLabel = myLabel.substr(0, lengthLabel);
572 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
576 }else { //you are a child process
577 vector < map<string, string> > myNames;
579 //recieve the files you need to process
580 //receive number of file pairs
582 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
586 for (int j = 0; j < num; j++) { //receive filenames to process
588 char tempDistFileName[1024];
590 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
591 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
593 string myDistFileName = tempDistFileName;
594 myDistFileName = myDistFileName.substr(0, lengthDist);
597 char tempNameFileName[1024];
599 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
600 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
602 string myNameFileName = tempNameFileName;
603 myNameFileName = myNameFileName.substr(0, lengthName);
606 myNames[j][myDistFileName] = myNameFileName;
610 listFileNames = cluster(myNames, labels);
613 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
615 //send list filenames to root process
616 for (int j = 0; j < num; j++) {
617 char tempListFileName[1024];
618 strcpy(tempListFileName, listFileNames[j].c_str());
619 int lengthList = listFileNames[j].length();
621 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
622 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
625 //send Labels to root process
626 int numLabels = labels.size();
627 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
629 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
631 strcpy(tempLabel, (*it).c_str());
632 int lengthLabel = (*it).length();
634 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
635 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
640 MPI_Barrier(MPI_COMM_WORLD);
643 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
645 if (processors > distName.size()) { processors = distName.size(); }
647 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
649 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
651 listFileNames = createProcesses(distName, labels);
654 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
657 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
659 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
661 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
663 //****************** merge list file and create rabund and sabund files ******************************//
665 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
668 if (pid == 0) { //only process 0 merges
671 ListVector* listSingle;
672 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
674 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
676 mergeLists(listFileNames, labelBins, listSingle);
678 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
680 //delete after all are complete incase a crash happens
681 if (!deleteFiles) { for (int i = 0; i < distName.size(); i++) { m->mothurRemove(distName[i].begin()->first); m->mothurRemove(distName[i].begin()->second); } }
683 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
685 //set list file as new current listfile
687 itTypes = outputTypes.find("list");
688 if (itTypes != outputTypes.end()) {
689 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
692 //set rabund file as new current rabundfile
693 itTypes = outputTypes.find("rabund");
694 if (itTypes != outputTypes.end()) {
695 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
698 //set sabund file as new current sabundfile
699 itTypes = outputTypes.find("sabund");
700 if (itTypes != outputTypes.end()) {
701 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
704 m->mothurOutEndLine();
705 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
706 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
707 m->mothurOutEndLine();
710 } //only process 0 merges
713 MPI_Barrier(MPI_COMM_WORLD);
718 catch(exception& e) {
719 m->errorOut(e, "ClusterSplitCommand", "execute");
723 //**********************************************************************************************************************
724 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
727 map<float, int> labelBin;
728 vector<float> orderFloat;
732 if (singleton != "none") {
735 m->openInputFile(singleton, in);
737 string firstCol, secondCol;
738 listSingle = new ListVector();
740 if (countfile != "") { m->getline(in); m->gobble(in); }
743 in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
744 if (countfile == "") { listSingle->push_back(secondCol); }
745 else { listSingle->push_back(firstCol); }
749 m->mothurRemove(singleton);
751 numSingleBins = listSingle->getNumBins();
752 }else{ listSingle = NULL; numSingleBins = 0; }
754 //go through users set and make them floats so we can sort them
755 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
758 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
759 else if (*it == "unique") { temp = -1.0; }
761 if (temp <= cutoff) {
762 orderFloat.push_back(temp);
763 labelBin[temp] = numSingleBins; //initialize numbins
768 sort(orderFloat.begin(), orderFloat.end());
771 //get the list info from each file
772 for (int k = 0; k < listNames.size(); k++) {
774 if (m->control_pressed) {
775 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
776 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
780 InputData* input = new InputData(listNames[k], "list");
781 ListVector* list = input->getListVector();
782 string lastLabel = list->getLabel();
784 string filledInList = listNames[k] + "filledInTemp";
786 m->openOutputFile(filledInList, outFilled);
788 //for each label needed
789 for(int l = 0; l < orderFloat.size(); l++){
792 if (orderFloat[l] == -1) { thisLabel = "unique"; }
793 else { thisLabel = toString(orderFloat[l], length-1); }
795 //this file has reached the end
797 list = input->getListVector(lastLabel, true);
798 }else{ //do you have the distance, or do you need to fill in
801 if (list->getLabel() == "unique") { labelFloat = -1.0; }
802 else { convert(list->getLabel(), labelFloat); }
804 //check for missing labels
805 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
806 //if its bigger get last label, otherwise keep it
808 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
810 lastLabel = list->getLabel();
814 list->setLabel(thisLabel);
815 list->print(outFilled);
818 labelBin[orderFloat[l]] += list->getNumBins();
822 list = input->getListVector();
825 if (list != NULL) { delete list; }
829 m->mothurRemove(listNames[k]);
830 rename(filledInList.c_str(), listNames[k].c_str());
835 catch(exception& e) {
836 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
840 //**********************************************************************************************************************
841 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
843 if (outputDir == "") { outputDir += m->hasPath(distfile); }
844 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
846 map<string, string> variables;
847 variables["[filename]"] = fileroot;
848 variables["[clustertag]"] = tag;
849 string sabundFileName = getOutputFileName("sabund", variables);
850 string rabundFileName = getOutputFileName("rabund", variables);
851 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
852 string listFileName = getOutputFileName("list", variables);
854 if (countfile == "") {
855 m->openOutputFile(sabundFileName, outSabund);
856 m->openOutputFile(rabundFileName, outRabund);
857 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
858 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
861 m->openOutputFile(listFileName, outList);
862 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
864 map<float, int>::iterator itLabel;
866 //clears out junk for autocompleting of list files above. Perhaps there is a beter way to handle this from within the data structure?
867 m->printedListHeaders = false;
869 //for each label needed
870 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
873 if (itLabel->first == -1) { thisLabel = "unique"; }
874 else { thisLabel = toString(itLabel->first, length-1); }
876 //outList << thisLabel << '\t' << itLabel->second << '\t';
878 RAbundVector* rabund = NULL;
879 ListVector completeList;
880 completeList.setLabel(thisLabel);
882 if (countfile == "") {
883 rabund = new RAbundVector();
884 rabund->setLabel(thisLabel);
888 if (listSingle != NULL) {
889 for (int j = 0; j < listSingle->getNumBins(); j++) {
890 //outList << listSingle->get(j) << '\t';
891 completeList.push_back(listSingle->get(j));
892 if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
896 //get the list info from each file
897 for (int k = 0; k < listNames.size(); k++) {
899 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
901 InputData* input = new InputData(listNames[k], "list");
902 ListVector* list = input->getListVector(thisLabel);
904 //this file has reached the end
905 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
907 for (int j = 0; j < list->getNumBins(); j++) {
908 //outList << list->get(j) << '\t';
909 completeList.push_back(list->get(j));
910 if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
917 if (countfile == "") {
918 SAbundVector sabund = rabund->getSAbundVector();
919 sabund.print(outSabund);
920 rabund->print(outRabund);
923 if (!m->printedListHeaders) {
924 m->listBinLabelsInFile.clear(); completeList.printHeaders(outList); }
925 completeList.print(outList);
927 if (rabund != NULL) { delete rabund; }
931 if (countfile == "") {
935 if (listSingle != NULL) { delete listSingle; }
937 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
941 catch(exception& e) {
942 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
947 //**********************************************************************************************************************
949 void ClusterSplitCommand::printData(ListVector* oldList){
951 string label = oldList->getLabel();
952 RAbundVector oldRAbund = oldList->getRAbundVector();
954 oldRAbund.setLabel(label);
955 if (m->isTrue(showabund)) {
956 oldRAbund.getSAbundVector().print(cout);
958 oldRAbund.print(outRabund);
959 oldRAbund.getSAbundVector().print(outSabund);
961 oldList->print(outList);
963 catch(exception& e) {
964 m->errorOut(e, "ClusterSplitCommand", "printData");
968 //**********************************************************************************************************************
969 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
972 vector<string> listFiles;
973 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
974 dividedNames.resize(processors);
976 //for each file group figure out which process will complete it
977 //want to divide the load intelligently so the big files are spread between processes
978 for (int i = 0; i < distName.size(); i++) {
980 int processToAssign = (i+1) % processors;
981 if (processToAssign == 0) { processToAssign = processors; }
983 dividedNames[(processToAssign-1)].push_back(distName[i]);
984 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
987 //now lets reverse the order of ever other process, so we balance big files running with little ones
988 for (int i = 0; i < processors; i++) {
990 int remainder = ((i+1) % processors);
991 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
994 if (m->control_pressed) { return listFiles; }
996 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1000 //loop through and create all the processes you want
1001 while (process != processors) {
1005 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1007 }else if (pid == 0){
1009 vector<string> listFileNames = cluster(dividedNames[process], labels);
1011 //write out names to file
1012 string filename = m->mothurGetpid(process) + ".temp";
1014 m->openOutputFile(filename, out);
1016 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
1021 filename = m->mothurGetpid(process) + ".temp.labels";
1022 m->openOutputFile(filename, outLabels);
1024 outLabels << cutoff << endl;
1025 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
1026 outLabels << (*it) << endl;
1032 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1033 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1039 listFiles = cluster(dividedNames[0], labels);
1041 //force parent to wait until all the processes are done
1042 for (int i=0;i< processIDS.size();i++) {
1043 int temp = processIDS[i];
1047 //get list of list file names from each process
1048 for(int i=0;i<processIDS.size();i++){
1049 string filename = toString(processIDS[i]) + ".temp";
1051 m->openInputFile(filename, in);
1053 in >> tag; m->gobble(in);
1057 in >> tempName; m->gobble(in);
1058 listFiles.push_back(tempName);
1061 m->mothurRemove((toString(processIDS[i]) + ".temp"));
1064 filename = toString(processIDS[i]) + ".temp.labels";
1066 m->openInputFile(filename, in2);
1069 in2 >> tempCutoff; m->gobble(in2);
1070 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
1074 in2 >> tempName; m->gobble(in2);
1075 if (labels.count(tempName) == 0) { labels.insert(tempName); }
1078 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
1084 //////////////////////////////////////////////////////////////////////////////////////////////////////
1085 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
1086 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1087 //Taking advantage of shared memory to allow both threads to add labels.
1088 //////////////////////////////////////////////////////////////////////////////////////////////////////
1090 vector<clusterData*> pDataArray;
1091 DWORD dwThreadIdArray[processors-1];
1092 HANDLE hThreadArray[processors-1];
1094 //Create processor worker threads.
1095 for( int i=1; i<processors; i++ ){
1096 // Allocate memory for thread data.
1097 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
1098 pDataArray.push_back(tempCluster);
1099 processIDS.push_back(i);
1101 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
1102 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
1103 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
1108 listFiles = cluster(dividedNames[0], labels);
1110 //Wait until all threads have terminated.
1111 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1113 //Close all thread handles and free memory allocations.
1114 for(int i=0; i < pDataArray.size(); i++){
1116 tag = pDataArray[i]->tag;
1117 //get listfiles created
1118 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1120 set<string>::iterator it;
1121 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1123 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1124 CloseHandle(hThreadArray[i]);
1125 delete pDataArray[i];
1133 catch(exception& e) {
1134 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1138 //**********************************************************************************************************************
1140 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1143 vector<string> listFileNames;
1144 double smallestCutoff = cutoff;
1146 //cluster each distance file
1147 for (int i = 0; i < distNames.size(); i++) {
1149 string thisNamefile = distNames[i].begin()->second;
1150 string thisDistFile = distNames[i].begin()->first;
1152 string listFileName = "";
1153 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1154 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1156 if (m->control_pressed) { //clean up
1157 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1158 listFileNames.clear(); return listFileNames;
1161 listFileNames.push_back(listFileName);
1164 cutoff = smallestCutoff;
1166 return listFileNames;
1169 catch(exception& e) {
1170 m->errorOut(e, "ClusterSplitCommand", "cluster");
1176 //**********************************************************************************************************************
1177 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1179 string listFileName = "";
1181 ListVector* list = NULL;
1183 RAbundVector* rabund = NULL;
1187 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1189 //output your files too
1191 cout << endl << "Reading " << thisDistFile << endl;
1195 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1197 //reads phylip file storing data in 2D vector, also fills list and rabund
1199 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1201 NameAssignment* nameMap = NULL;
1202 CountTable* ct = NULL;
1204 nameMap = new NameAssignment(thisNamefile);
1206 cluster->readPhylipFile(thisDistFile, nameMap);
1207 }else if (countfile != "") {
1208 ct = new CountTable();
1209 ct->readTable(thisNamefile, false, false);
1210 cluster->readPhylipFile(thisDistFile, ct);
1212 tag = cluster->getTag();
1214 if (m->control_pressed) { if(namefile != ""){ delete nameMap; }
1215 else { delete ct; } delete cluster; return 0; }
1217 list = cluster->getListVector();
1218 rabund = cluster->getRAbundVector();
1220 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1221 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1222 listFileName = fileroot+ tag + ".list";
1225 m->openOutputFile(fileroot+ tag + ".list", listFile);
1227 float previousDist = 0.00000;
1228 float rndPreviousDist = 0.00000;
1232 //output your files too
1234 cout << endl << "Clustering " << thisDistFile << endl;
1238 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1240 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1241 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; }
1242 else { delete ct; } return listFileName; }
1244 cluster->update(cutoff);
1246 float dist = cluster->getSmallDist();
1249 rndDist = m->ceilDist(dist, precision);
1251 rndDist = m->roundDist(dist, precision);
1254 if(previousDist <= 0.0000 && dist != previousDist){
1255 oldList.setLabel("unique");
1256 oldList.print(listFile);
1257 if (labels.count("unique") == 0) { labels.insert("unique"); }
1259 else if(rndDist != rndPreviousDist){
1260 oldList.setLabel(toString(rndPreviousDist, length-1));
1261 oldList.print(listFile);
1262 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1266 previousDist = dist;
1267 rndPreviousDist = rndDist;
1271 if(previousDist <= 0.0000){
1272 oldList.setLabel("unique");
1273 oldList.print(listFile);
1274 if (labels.count("unique") == 0) { labels.insert("unique"); }
1276 else if(rndPreviousDist<cutoff){
1277 oldList.setLabel(toString(rndPreviousDist, length-1));
1278 oldList.print(listFile);
1279 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1285 delete cluster; delete list; delete rabund;
1286 if(namefile != ""){ delete nameMap; }
1290 m->mothurRemove(thisDistFile);
1291 m->mothurRemove(thisNamefile);
1293 return listFileName;
1296 catch(exception& e) {
1297 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1302 //**********************************************************************************************************************
1303 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1305 string listFileName = "";
1307 Cluster* cluster = NULL;
1308 SparseDistanceMatrix* matrix = NULL;
1309 ListVector* list = NULL;
1311 RAbundVector* rabund = NULL;
1313 if (m->control_pressed) { return listFileName; }
1317 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1319 //output your files too
1321 cout << endl << "Reading " << thisDistFile << endl;
1325 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1327 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1328 read->setCutoff(cutoff);
1330 NameAssignment* nameMap = NULL;
1331 CountTable* ct = NULL;
1333 nameMap = new NameAssignment(thisNamefile);
1335 read->read(nameMap);
1336 }else if (countfile != "") {
1337 ct = new CountTable();
1338 ct->readTable(thisNamefile, false, false);
1340 }else { read->read(nameMap); }
1342 list = read->getListVector();
1344 matrix = read->getDMatrix();
1346 if(countfile != "") {
1347 rabund = new RAbundVector();
1348 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
1350 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1352 delete read; read = NULL;
1353 if (namefile != "") { delete nameMap; nameMap = NULL; }
1357 //output your files too
1359 cout << endl << "Clustering " << thisDistFile << endl;
1363 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1366 float adjust = -1.0;
1367 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
1368 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
1369 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); }
1370 tag = cluster->getTag();
1372 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1373 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1376 m->openOutputFile(fileroot+ tag + ".list", listFile);
1378 listFileName = fileroot+ tag + ".list";
1380 float previousDist = 0.00000;
1381 float rndPreviousDist = 0.00000;
1387 double saveCutoff = cutoff;
1389 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1391 if (m->control_pressed) { //clean up
1392 delete matrix; delete list; delete cluster; delete rabund;
1394 m->mothurRemove(listFileName);
1395 return listFileName;
1398 cluster->update(saveCutoff);
1400 float dist = matrix->getSmallDist();
1403 rndDist = m->ceilDist(dist, precision);
1405 rndDist = m->roundDist(dist, precision);
1408 if(previousDist <= 0.0000 && dist != previousDist){
1409 oldList.setLabel("unique");
1410 oldList.print(listFile);
1411 if (labels.count("unique") == 0) { labels.insert("unique"); }
1413 else if(rndDist != rndPreviousDist){
1414 oldList.setLabel(toString(rndPreviousDist, length-1));
1415 oldList.print(listFile);
1416 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1419 previousDist = dist;
1420 rndPreviousDist = rndDist;
1425 if(previousDist <= 0.0000){
1426 oldList.setLabel("unique");
1427 oldList.print(listFile);
1428 if (labels.count("unique") == 0) { labels.insert("unique"); }
1430 else if(rndPreviousDist<cutoff){
1431 oldList.setLabel(toString(rndPreviousDist, length-1));
1432 oldList.print(listFile);
1433 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1436 delete matrix; delete list; delete cluster; delete rabund;
1437 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1440 if (m->control_pressed) { //clean up
1441 m->mothurRemove(listFileName);
1442 return listFileName;
1446 m->mothurRemove(thisDistFile);
1447 m->mothurRemove(thisNamefile);
1450 if (saveCutoff != cutoff) {
1451 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1452 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1454 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1457 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1459 return listFileName;
1462 catch(exception& e) {
1463 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1467 //**********************************************************************************************************************
1469 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1474 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1479 string thisOutputDir = outputDir;
1480 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1481 map<string, string> variables;
1482 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1483 string outputFileName = getOutputFileName("column", variables);
1484 m->mothurRemove(outputFileName);
1487 for (int i = 0; i < distNames.size(); i++) {
1488 if (m->control_pressed) { return 0; }
1490 string thisDistFile = distNames[i].begin()->first;
1492 m->appendFiles(thisDistFile, outputFileName);
1495 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1505 catch(exception& e) {
1506 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1510 //**********************************************************************************************************************
1511 int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
1513 rabund->setLabel(list->getLabel());
1514 for(int i = 0; i < list->getNumBins(); i++) {
1515 if (m->control_pressed) { break; }
1516 vector<string> binNames;
1517 string bin = list->get(i);
1518 m->splitAtComma(bin, binNames);
1520 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
1521 rabund->push_back(total);
1525 catch(exception& e) {
1526 m->errorOut(e, "ClusterCommand", "createRabund");
1531 //**********************************************************************************************************************
1532 int ClusterSplitCommand::printFile(string singleton, vector< map<string, string> >& distName){
1535 map<string, string> variables;
1536 string thisOutputDir = outputDir;
1537 if (outputDir == "") { thisOutputDir = m->hasPath(distfile); }
1538 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(distfile));
1539 string outputFileName = getOutputFileName("file", variables);
1540 m->openOutputFile(outputFileName, out);
1542 out << singleton << endl;
1543 if (namefile != "") { out << "name" << endl; }
1544 else if (countfile != "") { out << "count" << endl; }
1545 else { out << "unknown" << endl; }
1547 for (int i = 0; i < distName.size(); i++) { out << distName[i].begin()->first << '\t' << distName[i].begin()->second << endl; }
1552 catch(exception& e) {
1553 m->errorOut(e, "ClusterCommand", "printFile");
1558 //**********************************************************************************************************************
1559 string ClusterSplitCommand::readFile(vector< map<string, string> >& distName){
1561 string singleton, thiscolumn, thisname, type;
1564 m->openInputFile(file, in);
1566 in >> singleton; m->gobble(in);
1567 in >> type; m->gobble(in);
1569 if (type == "name") { namefile = "name"; }
1570 else if (type == "count") { countfile = "count"; }
1571 else { m->mothurOut("[ERROR]: unknown file type. Are the files in column 2 of the file name files or count files? Please change unknown to name or count.\n"); m->control_pressed = true; }
1575 if (m->control_pressed) { break; }
1577 in >> thiscolumn; m->gobble(in);
1578 in >> thisname; m->gobble(in);
1580 map<string, string> temp;
1581 temp[thiscolumn] = thisname;
1582 distName.push_back(temp);
1588 catch(exception& e) {
1589 m->errorOut(e, "ClusterCommand", "readFile");
1594 //**********************************************************************************************************************