2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){
22 string AlignArray[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){
35 //initialize outputTypes
36 vector<string> tempOutNames;
37 outputTypes["list"] = tempOutNames;
38 outputTypes["rabund"] = tempOutNames;
39 outputTypes["sabund"] = tempOutNames;
40 outputTypes["column"] = tempOutNames;
43 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
47 //**********************************************************************************************************************
48 vector<string> ClusterSplitCommand::getRequiredParameters(){
50 string Array[] = {"fasta","phylip","column","or"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
59 //**********************************************************************************************************************
60 vector<string> ClusterSplitCommand::getRequiredFiles(){
62 vector<string> myArray;
66 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
70 //**********************************************************************************************************************
71 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
72 ClusterSplitCommand::ClusterSplitCommand(string option) {
74 globaldata = GlobalData::getInstance();
78 //allow user to run help
79 if(option == "help") { help(); abort = true; }
82 //valid paramters for this command
83 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
84 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
86 OptionParser parser(option);
87 map<string,string> parameters = parser.getParameters();
89 ValidParameters validParameter("cluster.split");
91 //check to make sure all parameters are valid for command
92 map<string,string>::iterator it;
93 for (it = parameters.begin(); it != parameters.end(); it++) {
94 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["list"] = tempOutNames;
102 outputTypes["rabund"] = tempOutNames;
103 outputTypes["sabund"] = tempOutNames;
104 outputTypes["column"] = tempOutNames;
106 globaldata->newRead();
108 //if the user changes the output directory command factory will send this info to us in the output parameter
109 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
111 //if the user changes the input directory command factory will send this info to us in the output parameter
112 string inputDir = validParameter.validFile(parameters, "inputdir", false);
113 if (inputDir == "not found"){ inputDir = ""; }
116 it = parameters.find("phylip");
117 //user has given a template file
118 if(it != parameters.end()){
119 path = m->hasPath(it->second);
120 //if the user has not given a path then, add inputdir. else leave path alone.
121 if (path == "") { parameters["phylip"] = inputDir + it->second; }
124 it = parameters.find("column");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["column"] = inputDir + it->second; }
132 it = parameters.find("name");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["name"] = inputDir + it->second; }
140 it = parameters.find("taxonomy");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
148 it = parameters.find("fasta");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["fasta"] = inputDir + it->second; }
157 //check for required parameters
158 phylipfile = validParameter.validFile(parameters, "phylip", true);
159 if (phylipfile == "not open") { abort = true; }
160 else if (phylipfile == "not found") { phylipfile = ""; }
161 else { distfile = phylipfile; format = "phylip"; }
163 columnfile = validParameter.validFile(parameters, "column", true);
164 if (columnfile == "not open") { abort = true; }
165 else if (columnfile == "not found") { columnfile = ""; }
166 else { distfile = columnfile; format = "column"; }
168 namefile = validParameter.validFile(parameters, "name", true);
169 if (namefile == "not open") { abort = true; }
170 else if (namefile == "not found") { namefile = ""; }
172 fastafile = validParameter.validFile(parameters, "fasta", true);
173 if (fastafile == "not open") { abort = true; }
174 else if (fastafile == "not found") { fastafile = ""; }
175 else { distfile = fastafile; splitmethod = "fasta"; }
177 taxFile = validParameter.validFile(parameters, "taxonomy", true);
178 if (taxFile == "not open") { abort = true; }
179 else if (taxFile == "not found") { taxFile = ""; }
181 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
182 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
184 if (columnfile != "") {
185 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
188 if (fastafile != "") {
189 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
193 //check for optional parameter and set defaults
194 // ...at some point should added some additional type checking...
195 //get user cutoff and precision or use defaults
197 temp = validParameter.validFile(parameters, "precision", false);
198 if (temp == "not found") { temp = "100"; }
199 //saves precision legnth for formatting below
200 length = temp.length();
201 convert(temp, precision);
203 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
204 hard = m->isTrue(temp);
206 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
207 large = m->isTrue(temp);
209 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
210 convert(temp, processors);
212 temp = validParameter.validFile(parameters, "splitmethod", false);
213 if (splitmethod != "fasta") {
214 if (temp == "not found") { splitmethod = "distance"; }
215 else { splitmethod = temp; }
218 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
219 convert(temp, cutoff);
220 cutoff += (5 / (precision * 10.0));
222 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
223 convert(temp, taxLevelCutoff);
225 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
227 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
228 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
230 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
231 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
233 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
235 showabund = validParameter.validFile(parameters, "showabund", false);
236 if (showabund == "not found") { showabund = "T"; }
238 timing = validParameter.validFile(parameters, "timing", false);
239 if (timing == "not found") { timing = "F"; }
243 catch(exception& e) {
244 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
249 //**********************************************************************************************************************
251 void ClusterSplitCommand::help(){
253 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
254 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
255 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
256 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
257 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
258 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
259 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
260 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
261 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
262 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
263 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
264 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
265 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
266 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
267 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
268 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
269 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
271 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
273 m->mothurOut("The cluster.split command should be in the following format: \n");
274 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
275 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
278 catch(exception& e) {
279 m->errorOut(e, "ClusterSplitCommand", "help");
284 //**********************************************************************************************************************
286 ClusterSplitCommand::~ClusterSplitCommand(){}
288 //**********************************************************************************************************************
290 int ClusterSplitCommand::execute(){
293 if (abort == true) { return 0; }
296 vector<string> listFileNames;
298 string singletonName = "";
299 double saveCutoff = cutoff;
301 //****************** file prep work ******************************//
306 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
307 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
309 if (pid == 0) { //only process 0 converts and splits
313 //if user gave a phylip file convert to column file
314 if (format == "phylip") {
316 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
318 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
320 NameAssignment* nameMap = NULL;
321 convert->setFormat("phylip");
322 convert->read(nameMap);
324 if (m->control_pressed) { delete convert; return 0; }
326 distfile = convert->getOutputFile();
328 //if no names file given with phylip file, create it
329 ListVector* listToMakeNameFile = convert->getListVector();
330 if (namefile == "") { //you need to make a namefile for split matrix
332 namefile = phylipfile + ".names";
333 m->openOutputFile(namefile, out);
334 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
335 string bin = listToMakeNameFile->get(i);
336 out << bin << '\t' << bin << endl;
340 delete listToMakeNameFile;
343 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
345 if (m->control_pressed) { return 0; }
348 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
350 //split matrix into non-overlapping groups
352 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
353 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
354 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
355 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
359 if (m->control_pressed) { delete split; return 0; }
361 singletonName = split->getSingletonNames();
362 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
365 //output a merged distance file
366 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
369 if (m->control_pressed) { return 0; }
371 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
374 //****************** break up files between processes and cluster each file set ******************************//
376 ////you are process 0 from above////
378 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
379 dividedNames.resize(processors);
381 //for each file group figure out which process will complete it
382 //want to divide the load intelligently so the big files are spread between processes
383 for (int i = 0; i < distName.size(); i++) {
384 int processToAssign = (i+1) % processors;
385 if (processToAssign == 0) { processToAssign = processors; }
387 dividedNames[(processToAssign-1)].push_back(distName[i]);
390 //not lets reverse the order of ever other process, so we balance big files running with little ones
391 for (int i = 0; i < processors; i++) {
392 int remainder = ((i+1) % processors);
393 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
397 //send each child the list of files it needs to process
398 for(int i = 1; i < processors; i++) {
399 //send number of file pairs
400 int num = dividedNames[i].size();
401 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403 for (int j = 0; j < num; j++) { //send filenames to process i
404 char tempDistFileName[1024];
405 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
406 int lengthDist = (dividedNames[i][j].begin()->first).length();
408 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
409 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
411 char tempNameFileName[1024];
412 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
413 int lengthName = (dividedNames[i][j].begin()->second).length();
415 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
421 listFileNames = cluster(dividedNames[0], labels);
423 //receive the other processes info
424 for(int i = 1; i < processors; i++) {
425 int num = dividedNames[i].size();
428 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
429 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
431 //send list filenames to root process
432 for (int j = 0; j < num; j++) {
434 char tempListFileName[1024];
436 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
437 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
439 string myListFileName = tempListFileName;
440 myListFileName = myListFileName.substr(0, lengthList);
442 listFileNames.push_back(myListFileName);
445 //send Labels to root process
447 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
449 for (int j = 0; j < numLabels; j++) {
453 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
454 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
456 string myLabel = tempLabel;
457 myLabel = myLabel.substr(0, lengthLabel);
459 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
463 }else { //you are a child process
464 vector < map<string, string> > myNames;
466 //recieve the files you need to process
467 //receive number of file pairs
469 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
473 for (int j = 0; j < num; j++) { //receive filenames to process
475 char tempDistFileName[1024];
477 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
478 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
480 string myDistFileName = tempDistFileName;
481 myDistFileName = myDistFileName.substr(0, lengthDist);
484 char tempNameFileName[1024];
486 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
487 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
489 string myNameFileName = tempNameFileName;
490 myNameFileName = myNameFileName.substr(0, lengthName);
493 myNames[j][myDistFileName] = myNameFileName;
497 listFileNames = cluster(myNames, labels);
500 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
502 //send list filenames to root process
503 for (int j = 0; j < num; j++) {
504 char tempListFileName[1024];
505 strcpy(tempListFileName, listFileNames[j].c_str());
506 int lengthList = listFileNames[j].length();
508 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
509 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
512 //send Labels to root process
513 int numLabels = labels.size();
514 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
516 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
518 strcpy(tempLabel, (*it).c_str());
519 int lengthLabel = (*it).length();
521 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
522 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
527 MPI_Barrier(MPI_COMM_WORLD);
531 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
533 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
535 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
536 dividedNames.resize(processors);
538 //for each file group figure out which process will complete it
539 //want to divide the load intelligently so the big files are spread between processes
540 for (int i = 0; i < distName.size(); i++) {
541 int processToAssign = (i+1) % processors;
542 if (processToAssign == 0) { processToAssign = processors; }
544 dividedNames[(processToAssign-1)].push_back(distName[i]);
547 //not lets reverse the order of ever other process, so we balance big files running with little ones
548 for (int i = 0; i < processors; i++) {
549 int remainder = ((i+1) % processors);
550 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
553 createProcesses(dividedNames);
555 if (m->control_pressed) { return 0; }
557 //get list of list file names from each process
558 for(int i=0;i<processors;i++){
559 string filename = toString(processIDS[i]) + ".temp";
561 m->openInputFile(filename, in);
563 in >> tag; m->gobble(in);
567 in >> tempName; m->gobble(in);
568 listFileNames.push_back(tempName);
571 remove((toString(processIDS[i]) + ".temp").c_str());
574 filename = toString(processIDS[i]) + ".temp.labels";
576 m->openInputFile(filename, in2);
579 in2 >> tempCutoff; m->gobble(in2);
580 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
584 in2 >> tempName; m->gobble(in2);
585 if (labels.count(tempName) == 0) { labels.insert(tempName); }
588 remove((toString(processIDS[i]) + ".temp.labels").c_str());
592 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
595 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
597 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
599 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
601 //****************** merge list file and create rabund and sabund files ******************************//
603 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
606 if (pid == 0) { //only process 0 merges
609 ListVector* listSingle;
610 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
612 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
614 mergeLists(listFileNames, labelBins, listSingle);
616 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
618 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
620 m->mothurOutEndLine();
621 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
622 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
623 m->mothurOutEndLine();
626 } //only process 0 merges
629 MPI_Barrier(MPI_COMM_WORLD);
634 catch(exception& e) {
635 m->errorOut(e, "ClusterSplitCommand", "execute");
639 //**********************************************************************************************************************
640 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
643 map<float, int> labelBin;
644 vector<float> orderFloat;
648 if (singleton != "none") {
650 m->openInputFile(singleton, in);
652 string firstCol, secondCol;
653 listSingle = new ListVector();
655 in >> firstCol >> secondCol; m->gobble(in);
656 listSingle->push_back(secondCol);
659 remove(singleton.c_str());
661 numSingleBins = listSingle->getNumBins();
662 }else{ listSingle = NULL; numSingleBins = 0; }
664 //go through users set and make them floats so we can sort them
665 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
668 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
669 else if (*it == "unique") { temp = -1.0; }
671 if (temp <= cutoff) {
672 orderFloat.push_back(temp);
673 labelBin[temp] = numSingleBins; //initialize numbins
678 sort(orderFloat.begin(), orderFloat.end());
681 //get the list info from each file
682 for (int k = 0; k < listNames.size(); k++) {
684 if (m->control_pressed) {
685 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
686 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
690 InputData* input = new InputData(listNames[k], "list");
691 ListVector* list = input->getListVector();
692 string lastLabel = list->getLabel();
694 string filledInList = listNames[k] + "filledInTemp";
696 m->openOutputFile(filledInList, outFilled);
698 //for each label needed
699 for(int l = 0; l < orderFloat.size(); l++){
702 if (orderFloat[l] == -1) { thisLabel = "unique"; }
703 else { thisLabel = toString(orderFloat[l], length-1); }
705 //this file has reached the end
707 list = input->getListVector(lastLabel, true);
708 }else{ //do you have the distance, or do you need to fill in
711 if (list->getLabel() == "unique") { labelFloat = -1.0; }
712 else { convert(list->getLabel(), labelFloat); }
714 //check for missing labels
715 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
716 //if its bigger get last label, otherwise keep it
718 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
720 lastLabel = list->getLabel();
724 list->setLabel(thisLabel);
725 list->print(outFilled);
728 labelBin[orderFloat[l]] += list->getNumBins();
732 list = input->getListVector();
735 if (list != NULL) { delete list; }
739 remove(listNames[k].c_str());
740 rename(filledInList.c_str(), listNames[k].c_str());
745 catch(exception& e) {
746 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
750 //**********************************************************************************************************************
751 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
753 if (outputDir == "") { outputDir += m->hasPath(distfile); }
754 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
756 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
757 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
758 m->openOutputFile(fileroot+ tag + ".list", outList);
760 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
761 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
762 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
764 map<float, int>::iterator itLabel;
766 //for each label needed
767 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
770 if (itLabel->first == -1) { thisLabel = "unique"; }
771 else { thisLabel = toString(itLabel->first, length-1); }
773 outList << thisLabel << '\t' << itLabel->second << '\t';
775 RAbundVector* rabund = new RAbundVector();
776 rabund->setLabel(thisLabel);
779 if (listSingle != NULL) {
780 for (int j = 0; j < listSingle->getNumBins(); j++) {
781 outList << listSingle->get(j) << '\t';
782 rabund->push_back(m->getNumNames(listSingle->get(j)));
786 //get the list info from each file
787 for (int k = 0; k < listNames.size(); k++) {
789 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
791 InputData* input = new InputData(listNames[k], "list");
792 ListVector* list = input->getListVector(thisLabel);
794 //this file has reached the end
795 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
797 for (int j = 0; j < list->getNumBins(); j++) {
798 outList << list->get(j) << '\t';
799 rabund->push_back(m->getNumNames(list->get(j)));
806 SAbundVector sabund = rabund->getSAbundVector();
808 sabund.print(outSabund);
809 rabund->print(outRabund);
819 if (listSingle != NULL) { delete listSingle; }
821 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
825 catch(exception& e) {
826 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
831 //**********************************************************************************************************************
833 void ClusterSplitCommand::printData(ListVector* oldList){
835 string label = oldList->getLabel();
836 RAbundVector oldRAbund = oldList->getRAbundVector();
838 oldRAbund.setLabel(label);
839 if (m->isTrue(showabund)) {
840 oldRAbund.getSAbundVector().print(cout);
842 oldRAbund.print(outRabund);
843 oldRAbund.getSAbundVector().print(outSabund);
845 oldList->print(outList);
847 catch(exception& e) {
848 m->errorOut(e, "ClusterSplitCommand", "printData");
852 //**********************************************************************************************************************
853 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
856 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
861 //loop through and create all the processes you want
862 while (process != processors) {
866 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
870 vector<string> listFileNames = cluster(dividedNames[process], labels);
872 //write out names to file
873 string filename = toString(getpid()) + ".temp";
875 m->openOutputFile(filename, out);
877 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
882 filename = toString(getpid()) + ".temp.labels";
883 m->openOutputFile(filename, outLabels);
885 outLabels << cutoff << endl;
886 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
887 outLabels << (*it) << endl;
893 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
894 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
899 //force parent to wait until all the processes are done
900 for (int i=0;i<processors;i++) {
901 int temp = processIDS[i];
909 catch(exception& e) {
910 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
914 //**********************************************************************************************************************
916 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
919 SparseMatrix* matrix;
922 RAbundVector* rabund;
924 vector<string> listFileNames;
926 double smallestCutoff = cutoff;
928 //cluster each distance file
929 for (int i = 0; i < distNames.size(); i++) {
930 if (m->control_pressed) { return listFileNames; }
932 string thisNamefile = distNames[i].begin()->second;
933 string thisDistFile = distNames[i].begin()->first;
935 //read in distance file
936 globaldata->setNameFile(thisNamefile);
937 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
941 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
943 //output your files too
945 cout << endl << "Reading " << thisDistFile << endl;
949 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
951 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
952 read->setCutoff(cutoff);
954 NameAssignment* nameMap = new NameAssignment(thisNamefile);
958 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
960 list = read->getListVector();
962 matrix = read->getMatrix();
969 //output your files too
971 cout << endl << "Clustering " << thisDistFile << endl;
975 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
977 rabund = new RAbundVector(list->getRAbundVector());
980 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
981 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
982 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
983 tag = cluster->getTag();
985 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
986 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
989 m->openOutputFile(fileroot+ tag + ".list", listFile);
991 listFileNames.push_back(fileroot+ tag + ".list");
993 float previousDist = 0.00000;
994 float rndPreviousDist = 0.00000;
1000 double saveCutoff = cutoff;
1002 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1004 if (m->control_pressed) { //clean up
1005 delete matrix; delete list; delete cluster; delete rabund;
1007 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1008 listFileNames.clear(); return listFileNames;
1011 cluster->update(saveCutoff);
1013 float dist = matrix->getSmallDist();
1016 rndDist = m->ceilDist(dist, precision);
1018 rndDist = m->roundDist(dist, precision);
1021 if(previousDist <= 0.0000 && dist != previousDist){
1022 oldList.setLabel("unique");
1023 oldList.print(listFile);
1024 if (labels.count("unique") == 0) { labels.insert("unique"); }
1026 else if(rndDist != rndPreviousDist){
1027 oldList.setLabel(toString(rndPreviousDist, length-1));
1028 oldList.print(listFile);
1029 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1032 previousDist = dist;
1033 rndPreviousDist = rndDist;
1038 if(previousDist <= 0.0000){
1039 oldList.setLabel("unique");
1040 oldList.print(listFile);
1041 if (labels.count("unique") == 0) { labels.insert("unique"); }
1043 else if(rndPreviousDist<cutoff){
1044 oldList.setLabel(toString(rndPreviousDist, length-1));
1045 oldList.print(listFile);
1046 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1049 delete matrix; delete list; delete cluster; delete rabund;
1052 if (m->control_pressed) { //clean up
1053 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1054 listFileNames.clear(); return listFileNames;
1057 remove(thisDistFile.c_str());
1058 remove(thisNamefile.c_str());
1060 if (saveCutoff != cutoff) {
1061 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1062 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1064 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1067 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1070 cutoff = smallestCutoff;
1072 return listFileNames;
1075 catch(exception& e) {
1076 m->errorOut(e, "ClusterSplitCommand", "cluster");
1082 //**********************************************************************************************************************
1084 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1089 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1094 string thisOutputDir = outputDir;
1095 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1096 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1097 remove(outputFileName.c_str());
1100 for (int i = 0; i < distNames.size(); i++) {
1101 if (m->control_pressed) { return 0; }
1103 string thisDistFile = distNames[i].begin()->first;
1105 m->appendFiles(thisDistFile, outputFileName);
1108 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1118 catch(exception& e) {
1119 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1123 //**********************************************************************************************************************