2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
13 //**********************************************************************************************************************
14 vector<string> ClusterSplitCommand::setParameters(){
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
17 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
20 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
21 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
22 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
23 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
24 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
25 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
26 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
27 CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
28 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
29 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
30 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
31 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
32 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
33 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
35 vector<string> myArray;
36 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
40 m->errorOut(e, "ClusterSplitCommand", "setParameters");
44 //**********************************************************************************************************************
45 string ClusterSplitCommand::getHelpString(){
47 string helpString = "";
48 helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
49 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
50 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
51 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n";
52 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n";
53 helpString += "For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n";
54 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
55 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
56 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
57 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
58 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
59 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
60 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
61 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
62 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
63 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
64 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
65 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
67 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
69 helpString += "The cluster.split command should be in the following format: \n";
70 helpString += "cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
71 helpString += "Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n";
75 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
79 //**********************************************************************************************************************
80 ClusterSplitCommand::ClusterSplitCommand(){
82 abort = true; calledHelp = true;
84 vector<string> tempOutNames;
85 outputTypes["list"] = tempOutNames;
86 outputTypes["rabund"] = tempOutNames;
87 outputTypes["sabund"] = tempOutNames;
88 outputTypes["column"] = tempOutNames;
91 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
95 //**********************************************************************************************************************
96 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
97 ClusterSplitCommand::ClusterSplitCommand(string option) {
99 abort = false; calledHelp = false;
102 //allow user to run help
103 if(option == "help") { help(); abort = true; calledHelp = true; }
104 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
107 vector<string> myArray = setParameters();
109 OptionParser parser(option);
110 map<string,string> parameters = parser.getParameters();
112 ValidParameters validParameter("cluster.split");
114 //check to make sure all parameters are valid for command
115 map<string,string>::iterator it;
116 for (it = parameters.begin(); it != parameters.end(); it++) {
117 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
122 //initialize outputTypes
123 vector<string> tempOutNames;
124 outputTypes["list"] = tempOutNames;
125 outputTypes["rabund"] = tempOutNames;
126 outputTypes["sabund"] = tempOutNames;
127 outputTypes["column"] = tempOutNames;
129 //if the user changes the output directory command factory will send this info to us in the output parameter
130 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
132 //if the user changes the input directory command factory will send this info to us in the output parameter
133 string inputDir = validParameter.validFile(parameters, "inputdir", false);
134 if (inputDir == "not found"){ inputDir = ""; }
137 it = parameters.find("phylip");
138 //user has given a template file
139 if(it != parameters.end()){
140 path = m->hasPath(it->second);
141 //if the user has not given a path then, add inputdir. else leave path alone.
142 if (path == "") { parameters["phylip"] = inputDir + it->second; }
145 it = parameters.find("column");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["column"] = inputDir + it->second; }
153 it = parameters.find("name");
154 //user has given a template file
155 if(it != parameters.end()){
156 path = m->hasPath(it->second);
157 //if the user has not given a path then, add inputdir. else leave path alone.
158 if (path == "") { parameters["name"] = inputDir + it->second; }
161 it = parameters.find("taxonomy");
162 //user has given a template file
163 if(it != parameters.end()){
164 path = m->hasPath(it->second);
165 //if the user has not given a path then, add inputdir. else leave path alone.
166 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
169 it = parameters.find("fasta");
170 //user has given a template file
171 if(it != parameters.end()){
172 path = m->hasPath(it->second);
173 //if the user has not given a path then, add inputdir. else leave path alone.
174 if (path == "") { parameters["fasta"] = inputDir + it->second; }
178 //check for required parameters
179 phylipfile = validParameter.validFile(parameters, "phylip", true);
180 if (phylipfile == "not open") { abort = true; }
181 else if (phylipfile == "not found") { phylipfile = ""; }
182 else { distfile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
184 columnfile = validParameter.validFile(parameters, "column", true);
185 if (columnfile == "not open") { abort = true; }
186 else if (columnfile == "not found") { columnfile = ""; }
187 else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
189 namefile = validParameter.validFile(parameters, "name", true);
190 if (namefile == "not open") { abort = true; }
191 else if (namefile == "not found") { namefile = ""; }
192 else { m->setNameFile(namefile); }
194 fastafile = validParameter.validFile(parameters, "fasta", true);
195 if (fastafile == "not open") { abort = true; }
196 else if (fastafile == "not found") { fastafile = ""; }
197 else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
199 taxFile = validParameter.validFile(parameters, "taxonomy", true);
200 if (taxFile == "not open") { taxFile = ""; abort = true; }
201 else if (taxFile == "not found") { taxFile = ""; }
202 else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
204 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
205 //is there are current file available for either of these?
206 //give priority to column, then phylip, then fasta
207 columnfile = m->getColumnFile();
208 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
210 phylipfile = m->getPhylipFile();
211 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
213 fastafile = m->getFastaFile();
214 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
216 m->mothurOut("No valid current files. When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine();
222 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
224 if (columnfile != "") {
225 if (namefile == "") {
226 namefile = m->getNameFile();
227 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
229 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
235 if (fastafile != "") {
237 taxFile = m->getTaxonomyFile();
238 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
240 m->mothurOut("You need to provide a taxonomy file if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
245 if (namefile == "") {
246 namefile = m->getNameFile();
247 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
249 m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine();
255 //check for optional parameter and set defaults
256 // ...at some point should added some additional type checking...
257 //get user cutoff and precision or use defaults
259 temp = validParameter.validFile(parameters, "precision", false);
260 if (temp == "not found") { temp = "100"; }
261 //saves precision legnth for formatting below
262 length = temp.length();
263 m->mothurConvert(temp, precision);
265 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
266 hard = m->isTrue(temp);
268 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
269 large = m->isTrue(temp);
271 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
272 m->setProcessors(temp);
273 m->mothurConvert(temp, processors);
275 temp = validParameter.validFile(parameters, "splitmethod", false);
276 if ((splitmethod != "fasta") && (splitmethod != "classify")) {
277 if (temp == "not found") { splitmethod = "distance"; }
278 else { splitmethod = temp; }
281 temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
282 classic = m->isTrue(temp);
284 if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
286 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
287 m->mothurConvert(temp, cutoff);
288 cutoff += (5 / (precision * 10.0));
290 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
291 m->mothurConvert(temp, taxLevelCutoff);
293 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
295 if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
296 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
298 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
299 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
301 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
303 showabund = validParameter.validFile(parameters, "showabund", false);
304 if (showabund == "not found") { showabund = "T"; }
306 timing = validParameter.validFile(parameters, "timing", false);
307 if (timing == "not found") { timing = "F"; }
311 catch(exception& e) {
312 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
317 //**********************************************************************************************************************
319 int ClusterSplitCommand::execute(){
322 if (abort == true) { if (calledHelp) { return 0; } return 2; }
325 vector<string> listFileNames;
327 string singletonName = "";
328 double saveCutoff = cutoff;
330 //****************** file prep work ******************************//
335 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
336 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
338 if (pid == 0) { //only process 0 converts and splits
342 //if user gave a phylip file convert to column file
343 if (format == "phylip") {
345 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
347 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
349 NameAssignment* nameMap = NULL;
350 convert->setFormat("phylip");
351 convert->read(nameMap);
353 if (m->control_pressed) { delete convert; return 0; }
355 distfile = convert->getOutputFile();
357 //if no names file given with phylip file, create it
358 ListVector* listToMakeNameFile = convert->getListVector();
359 if (namefile == "") { //you need to make a namefile for split matrix
361 namefile = phylipfile + ".names";
362 m->openOutputFile(namefile, out);
363 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
364 string bin = listToMakeNameFile->get(i);
365 out << bin << '\t' << bin << endl;
369 delete listToMakeNameFile;
372 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
374 if (m->control_pressed) { return 0; }
377 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
379 //split matrix into non-overlapping groups
381 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
382 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
383 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
384 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
388 if (m->control_pressed) { delete split; return 0; }
390 singletonName = split->getSingletonNames();
391 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
394 //output a merged distance file
395 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
398 if (m->control_pressed) { return 0; }
400 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
403 //****************** break up files between processes and cluster each file set ******************************//
405 ////you are process 0 from above////
407 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
408 dividedNames.resize(processors);
410 //for each file group figure out which process will complete it
411 //want to divide the load intelligently so the big files are spread between processes
412 for (int i = 0; i < distName.size(); i++) {
413 int processToAssign = (i+1) % processors;
414 if (processToAssign == 0) { processToAssign = processors; }
416 dividedNames[(processToAssign-1)].push_back(distName[i]);
419 //not lets reverse the order of ever other process, so we balance big files running with little ones
420 for (int i = 0; i < processors; i++) {
421 int remainder = ((i+1) % processors);
422 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
426 //send each child the list of files it needs to process
427 for(int i = 1; i < processors; i++) {
428 //send number of file pairs
429 int num = dividedNames[i].size();
430 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
432 for (int j = 0; j < num; j++) { //send filenames to process i
433 char tempDistFileName[1024];
434 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
435 int lengthDist = (dividedNames[i][j].begin()->first).length();
437 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
438 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
440 char tempNameFileName[1024];
441 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
442 int lengthName = (dividedNames[i][j].begin()->second).length();
444 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
445 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
450 listFileNames = cluster(dividedNames[0], labels);
452 //receive the other processes info
453 for(int i = 1; i < processors; i++) {
454 int num = dividedNames[i].size();
457 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
458 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
460 //send list filenames to root process
461 for (int j = 0; j < num; j++) {
463 char tempListFileName[1024];
465 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
466 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
468 string myListFileName = tempListFileName;
469 myListFileName = myListFileName.substr(0, lengthList);
471 listFileNames.push_back(myListFileName);
474 //send Labels to root process
476 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
478 for (int j = 0; j < numLabels; j++) {
482 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
483 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
485 string myLabel = tempLabel;
486 myLabel = myLabel.substr(0, lengthLabel);
488 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
492 }else { //you are a child process
493 vector < map<string, string> > myNames;
495 //recieve the files you need to process
496 //receive number of file pairs
498 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
502 for (int j = 0; j < num; j++) { //receive filenames to process
504 char tempDistFileName[1024];
506 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
507 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
509 string myDistFileName = tempDistFileName;
510 myDistFileName = myDistFileName.substr(0, lengthDist);
513 char tempNameFileName[1024];
515 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
516 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
518 string myNameFileName = tempNameFileName;
519 myNameFileName = myNameFileName.substr(0, lengthName);
522 myNames[j][myDistFileName] = myNameFileName;
526 listFileNames = cluster(myNames, labels);
529 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
531 //send list filenames to root process
532 for (int j = 0; j < num; j++) {
533 char tempListFileName[1024];
534 strcpy(tempListFileName, listFileNames[j].c_str());
535 int lengthList = listFileNames[j].length();
537 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
538 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
541 //send Labels to root process
542 int numLabels = labels.size();
543 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
545 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
547 strcpy(tempLabel, (*it).c_str());
548 int lengthLabel = (*it).length();
550 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
551 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
556 MPI_Barrier(MPI_COMM_WORLD);
559 ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
561 if (processors > distName.size()) { processors = distName.size(); }
563 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
565 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
567 listFileNames = createProcesses(distName, labels);
570 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
573 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
575 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
577 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
579 //****************** merge list file and create rabund and sabund files ******************************//
581 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
584 if (pid == 0) { //only process 0 merges
587 ListVector* listSingle;
588 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
590 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
592 mergeLists(listFileNames, labelBins, listSingle);
594 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
596 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
598 //set list file as new current listfile
600 itTypes = outputTypes.find("list");
601 if (itTypes != outputTypes.end()) {
602 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
605 //set rabund file as new current rabundfile
606 itTypes = outputTypes.find("rabund");
607 if (itTypes != outputTypes.end()) {
608 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
611 //set sabund file as new current sabundfile
612 itTypes = outputTypes.find("sabund");
613 if (itTypes != outputTypes.end()) {
614 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
617 m->mothurOutEndLine();
618 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
619 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
620 m->mothurOutEndLine();
623 } //only process 0 merges
626 MPI_Barrier(MPI_COMM_WORLD);
631 catch(exception& e) {
632 m->errorOut(e, "ClusterSplitCommand", "execute");
636 //**********************************************************************************************************************
637 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
640 map<float, int> labelBin;
641 vector<float> orderFloat;
645 if (singleton != "none") {
647 m->openInputFile(singleton, in);
649 string firstCol, secondCol;
650 listSingle = new ListVector();
652 in >> firstCol >> secondCol; m->gobble(in);
653 listSingle->push_back(secondCol);
656 m->mothurRemove(singleton);
658 numSingleBins = listSingle->getNumBins();
659 }else{ listSingle = NULL; numSingleBins = 0; }
661 //go through users set and make them floats so we can sort them
662 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
665 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
666 else if (*it == "unique") { temp = -1.0; }
668 if (temp <= cutoff) {
669 orderFloat.push_back(temp);
670 labelBin[temp] = numSingleBins; //initialize numbins
675 sort(orderFloat.begin(), orderFloat.end());
678 //get the list info from each file
679 for (int k = 0; k < listNames.size(); k++) {
681 if (m->control_pressed) {
682 if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
683 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
687 InputData* input = new InputData(listNames[k], "list");
688 ListVector* list = input->getListVector();
689 string lastLabel = list->getLabel();
691 string filledInList = listNames[k] + "filledInTemp";
693 m->openOutputFile(filledInList, outFilled);
695 //for each label needed
696 for(int l = 0; l < orderFloat.size(); l++){
699 if (orderFloat[l] == -1) { thisLabel = "unique"; }
700 else { thisLabel = toString(orderFloat[l], length-1); }
702 //this file has reached the end
704 list = input->getListVector(lastLabel, true);
705 }else{ //do you have the distance, or do you need to fill in
708 if (list->getLabel() == "unique") { labelFloat = -1.0; }
709 else { convert(list->getLabel(), labelFloat); }
711 //check for missing labels
712 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
713 //if its bigger get last label, otherwise keep it
715 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
717 lastLabel = list->getLabel();
721 list->setLabel(thisLabel);
722 list->print(outFilled);
725 labelBin[orderFloat[l]] += list->getNumBins();
729 list = input->getListVector();
732 if (list != NULL) { delete list; }
736 m->mothurRemove(listNames[k]);
737 rename(filledInList.c_str(), listNames[k].c_str());
742 catch(exception& e) {
743 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
747 //**********************************************************************************************************************
748 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
750 if (outputDir == "") { outputDir += m->hasPath(distfile); }
751 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
753 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
754 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
755 m->openOutputFile(fileroot+ tag + ".list", outList);
757 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
758 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
759 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
761 map<float, int>::iterator itLabel;
763 //for each label needed
764 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
767 if (itLabel->first == -1) { thisLabel = "unique"; }
768 else { thisLabel = toString(itLabel->first, length-1); }
770 outList << thisLabel << '\t' << itLabel->second << '\t';
772 RAbundVector* rabund = new RAbundVector();
773 rabund->setLabel(thisLabel);
776 if (listSingle != NULL) {
777 for (int j = 0; j < listSingle->getNumBins(); j++) {
778 outList << listSingle->get(j) << '\t';
779 rabund->push_back(m->getNumNames(listSingle->get(j)));
783 //get the list info from each file
784 for (int k = 0; k < listNames.size(); k++) {
786 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } delete rabund; return 0; }
788 InputData* input = new InputData(listNames[k], "list");
789 ListVector* list = input->getListVector(thisLabel);
791 //this file has reached the end
792 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
794 for (int j = 0; j < list->getNumBins(); j++) {
795 outList << list->get(j) << '\t';
796 rabund->push_back(m->getNumNames(list->get(j)));
803 SAbundVector sabund = rabund->getSAbundVector();
805 sabund.print(outSabund);
806 rabund->print(outRabund);
816 if (listSingle != NULL) { delete listSingle; }
818 for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
822 catch(exception& e) {
823 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
828 //**********************************************************************************************************************
830 void ClusterSplitCommand::printData(ListVector* oldList){
832 string label = oldList->getLabel();
833 RAbundVector oldRAbund = oldList->getRAbundVector();
835 oldRAbund.setLabel(label);
836 if (m->isTrue(showabund)) {
837 oldRAbund.getSAbundVector().print(cout);
839 oldRAbund.print(outRabund);
840 oldRAbund.getSAbundVector().print(outSabund);
842 oldList->print(outList);
844 catch(exception& e) {
845 m->errorOut(e, "ClusterSplitCommand", "printData");
849 //**********************************************************************************************************************
850 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
853 vector<string> listFiles;
854 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
855 dividedNames.resize(processors);
857 //for each file group figure out which process will complete it
858 //want to divide the load intelligently so the big files are spread between processes
859 for (int i = 0; i < distName.size(); i++) {
861 int processToAssign = (i+1) % processors;
862 if (processToAssign == 0) { processToAssign = processors; }
864 dividedNames[(processToAssign-1)].push_back(distName[i]);
865 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
868 //not lets reverse the order of ever other process, so we balance big files running with little ones
869 for (int i = 0; i < processors; i++) {
871 int remainder = ((i+1) % processors);
872 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
875 if (m->control_pressed) { return listFiles; }
877 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
881 //loop through and create all the processes you want
882 while (process != processors) {
886 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
890 vector<string> listFileNames = cluster(dividedNames[process], labels);
892 //write out names to file
893 string filename = toString(getpid()) + ".temp";
895 m->openOutputFile(filename, out);
897 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
902 filename = toString(getpid()) + ".temp.labels";
903 m->openOutputFile(filename, outLabels);
905 outLabels << cutoff << endl;
906 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
907 outLabels << (*it) << endl;
913 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
914 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
920 listFiles = cluster(dividedNames[0], labels);
922 //force parent to wait until all the processes are done
923 for (int i=0;i< processIDS.size();i++) {
924 int temp = processIDS[i];
928 //get list of list file names from each process
929 for(int i=0;i<processIDS.size();i++){
930 string filename = toString(processIDS[i]) + ".temp";
932 m->openInputFile(filename, in);
934 in >> tag; m->gobble(in);
938 in >> tempName; m->gobble(in);
939 listFiles.push_back(tempName);
942 m->mothurRemove((toString(processIDS[i]) + ".temp"));
945 filename = toString(processIDS[i]) + ".temp.labels";
947 m->openInputFile(filename, in2);
950 in2 >> tempCutoff; m->gobble(in2);
951 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
955 in2 >> tempName; m->gobble(in2);
956 if (labels.count(tempName) == 0) { labels.insert(tempName); }
959 m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
965 //////////////////////////////////////////////////////////////////////////////////////////////////////
966 //Windows version shared memory, so be careful when passing variables through the clusterData struct.
967 //Above fork() will clone, so memory is separate, but that's not the case with windows,
968 //Taking advantage of shared memory to allow both threads to add labels.
969 //////////////////////////////////////////////////////////////////////////////////////////////////////
971 vector<clusterData*> pDataArray;
972 DWORD dwThreadIdArray[processors-1];
973 HANDLE hThreadArray[processors-1];
975 //Create processor worker threads.
976 for( int i=1; i<processors; i++ ){
977 // Allocate memory for thread data.
978 clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
979 pDataArray.push_back(tempCluster);
980 processIDS.push_back(i);
982 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
983 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
984 hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
989 listFiles = cluster(dividedNames[0], labels);
991 //Wait until all threads have terminated.
992 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
994 //Close all thread handles and free memory allocations.
995 for(int i=0; i < pDataArray.size(); i++){
997 tag = pDataArray[i]->tag;
998 //get listfiles created
999 for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
1001 set<string>::iterator it;
1002 for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
1004 if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
1005 CloseHandle(hThreadArray[i]);
1006 delete pDataArray[i];
1014 catch(exception& e) {
1015 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1019 //**********************************************************************************************************************
1021 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
1024 vector<string> listFileNames;
1025 double smallestCutoff = cutoff;
1027 //cluster each distance file
1028 for (int i = 0; i < distNames.size(); i++) {
1030 string thisNamefile = distNames[i].begin()->second;
1031 string thisDistFile = distNames[i].begin()->first;
1033 string listFileName = "";
1034 if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1035 else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
1037 if (m->control_pressed) { //clean up
1038 for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
1039 listFileNames.clear(); return listFileNames;
1042 listFileNames.push_back(listFileName);
1045 cutoff = smallestCutoff;
1047 return listFileNames;
1050 catch(exception& e) {
1051 m->errorOut(e, "ClusterSplitCommand", "cluster");
1057 //**********************************************************************************************************************
1058 string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1060 string listFileName = "";
1062 ListVector* list = NULL;
1064 RAbundVector* rabund = NULL;
1068 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1070 //output your files too
1072 cout << endl << "Reading " << thisDistFile << endl;
1076 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1078 NameAssignment* nameMap = new NameAssignment(thisNamefile);
1081 //reads phylip file storing data in 2D vector, also fills list and rabund
1083 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
1084 cluster->readPhylipFile(thisDistFile, nameMap);
1085 tag = cluster->getTag();
1087 if (m->control_pressed) { delete cluster; return 0; }
1089 list = cluster->getListVector();
1090 rabund = cluster->getRAbundVector();
1092 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1093 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1094 listFileName = fileroot+ tag + ".list";
1097 m->openOutputFile(fileroot+ tag + ".list", listFile);
1099 float previousDist = 0.00000;
1100 float rndPreviousDist = 0.00000;
1104 //output your files too
1106 cout << endl << "Clustering " << thisDistFile << endl;
1110 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1112 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
1113 if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); return listFileName; }
1115 cluster->update(cutoff);
1117 float dist = cluster->getSmallDist();
1120 rndDist = m->ceilDist(dist, precision);
1122 rndDist = m->roundDist(dist, precision);
1125 if(previousDist <= 0.0000 && dist != previousDist){
1126 oldList.setLabel("unique");
1127 oldList.print(listFile);
1128 if (labels.count("unique") == 0) { labels.insert("unique"); }
1130 else if(rndDist != rndPreviousDist){
1131 oldList.setLabel(toString(rndPreviousDist, length-1));
1132 oldList.print(listFile);
1133 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1137 previousDist = dist;
1138 rndPreviousDist = rndDist;
1142 if(previousDist <= 0.0000){
1143 oldList.setLabel("unique");
1144 oldList.print(listFile);
1145 if (labels.count("unique") == 0) { labels.insert("unique"); }
1147 else if(rndPreviousDist<cutoff){
1148 oldList.setLabel(toString(rndPreviousDist, length-1));
1149 oldList.print(listFile);
1150 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1156 delete cluster; delete nameMap; delete list; delete rabund;
1159 return listFileName;
1162 catch(exception& e) {
1163 m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
1168 //**********************************************************************************************************************
1169 string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
1171 string listFileName = "";
1173 Cluster* cluster = NULL;
1174 SparseMatrix* matrix = NULL;
1175 ListVector* list = NULL;
1177 RAbundVector* rabund = NULL;
1179 if (m->control_pressed) { return listFileName; }
1183 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1185 //output your files too
1187 cout << endl << "Reading " << thisDistFile << endl;
1191 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
1193 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1194 read->setCutoff(cutoff);
1196 NameAssignment* nameMap = new NameAssignment(thisNamefile);
1198 read->read(nameMap);
1200 if (m->control_pressed) { delete read; delete nameMap; return listFileName; }
1202 list = read->getListVector();
1204 matrix = read->getMatrix();
1206 delete read; read = NULL;
1207 delete nameMap; nameMap = NULL;
1211 //output your files too
1213 cout << endl << "Clustering " << thisDistFile << endl;
1217 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
1219 rabund = new RAbundVector(list->getRAbundVector());
1222 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
1223 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1224 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1225 tag = cluster->getTag();
1227 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1228 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1231 m->openOutputFile(fileroot+ tag + ".list", listFile);
1233 listFileName = fileroot+ tag + ".list";
1235 float previousDist = 0.00000;
1236 float rndPreviousDist = 0.00000;
1242 double saveCutoff = cutoff;
1244 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1246 if (m->control_pressed) { //clean up
1247 delete matrix; delete list; delete cluster; delete rabund;
1249 m->mothurRemove(listFileName);
1250 return listFileName;
1253 cluster->update(saveCutoff);
1255 float dist = matrix->getSmallDist();
1258 rndDist = m->ceilDist(dist, precision);
1260 rndDist = m->roundDist(dist, precision);
1263 if(previousDist <= 0.0000 && dist != previousDist){
1264 oldList.setLabel("unique");
1265 oldList.print(listFile);
1266 if (labels.count("unique") == 0) { labels.insert("unique"); }
1268 else if(rndDist != rndPreviousDist){
1269 oldList.setLabel(toString(rndPreviousDist, length-1));
1270 oldList.print(listFile);
1271 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1274 previousDist = dist;
1275 rndPreviousDist = rndDist;
1280 if(previousDist <= 0.0000){
1281 oldList.setLabel("unique");
1282 oldList.print(listFile);
1283 if (labels.count("unique") == 0) { labels.insert("unique"); }
1285 else if(rndPreviousDist<cutoff){
1286 oldList.setLabel(toString(rndPreviousDist, length-1));
1287 oldList.print(listFile);
1288 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1291 delete matrix; delete list; delete cluster; delete rabund;
1292 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1295 if (m->control_pressed) { //clean up
1296 m->mothurRemove(listFileName);
1297 return listFileName;
1300 m->mothurRemove(thisDistFile);
1301 m->mothurRemove(thisNamefile);
1303 if (saveCutoff != cutoff) {
1304 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1305 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1307 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1310 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1312 return listFileName;
1315 catch(exception& e) {
1316 m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1320 //**********************************************************************************************************************
1322 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1327 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1332 string thisOutputDir = outputDir;
1333 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1334 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1335 m->mothurRemove(outputFileName);
1338 for (int i = 0; i < distNames.size(); i++) {
1339 if (m->control_pressed) { return 0; }
1341 string thisDistFile = distNames[i].begin()->first;
1343 m->appendFiles(thisDistFile, outputFileName);
1346 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1356 catch(exception& e) {
1357 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1361 //**********************************************************************************************************************