2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){
22 string AlignArray[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["list"] = tempOutNames;
37 outputTypes["rabund"] = tempOutNames;
38 outputTypes["sabund"] = tempOutNames;
39 outputTypes["column"] = tempOutNames;
42 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
46 //**********************************************************************************************************************
47 vector<string> ClusterSplitCommand::getRequiredParameters(){
49 string Array[] = {"fasta","phylip","column","or"};
50 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
58 //**********************************************************************************************************************
59 vector<string> ClusterSplitCommand::getRequiredFiles(){
61 vector<string> myArray;
65 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
69 //**********************************************************************************************************************
70 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
71 ClusterSplitCommand::ClusterSplitCommand(string option) {
73 globaldata = GlobalData::getInstance();
74 abort = false; calledHelp = false;
77 //allow user to run help
78 if(option == "help") { help(); abort = true; calledHelp = true; }
81 //valid paramters for this command
82 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
83 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
85 OptionParser parser(option);
86 map<string,string> parameters = parser.getParameters();
88 ValidParameters validParameter("cluster.split");
90 //check to make sure all parameters are valid for command
91 map<string,string>::iterator it;
92 for (it = parameters.begin(); it != parameters.end(); it++) {
93 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
98 //initialize outputTypes
99 vector<string> tempOutNames;
100 outputTypes["list"] = tempOutNames;
101 outputTypes["rabund"] = tempOutNames;
102 outputTypes["sabund"] = tempOutNames;
103 outputTypes["column"] = tempOutNames;
105 globaldata->newRead();
107 //if the user changes the output directory command factory will send this info to us in the output parameter
108 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("phylip");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["phylip"] = inputDir + it->second; }
123 it = parameters.find("column");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["column"] = inputDir + it->second; }
131 it = parameters.find("name");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["name"] = inputDir + it->second; }
139 it = parameters.find("taxonomy");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
147 it = parameters.find("fasta");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["fasta"] = inputDir + it->second; }
156 //check for required parameters
157 phylipfile = validParameter.validFile(parameters, "phylip", true);
158 if (phylipfile == "not open") { abort = true; }
159 else if (phylipfile == "not found") { phylipfile = ""; }
160 else { distfile = phylipfile; format = "phylip"; }
162 columnfile = validParameter.validFile(parameters, "column", true);
163 if (columnfile == "not open") { abort = true; }
164 else if (columnfile == "not found") { columnfile = ""; }
165 else { distfile = columnfile; format = "column"; }
167 namefile = validParameter.validFile(parameters, "name", true);
168 if (namefile == "not open") { abort = true; }
169 else if (namefile == "not found") { namefile = ""; }
171 fastafile = validParameter.validFile(parameters, "fasta", true);
172 if (fastafile == "not open") { abort = true; }
173 else if (fastafile == "not found") { fastafile = ""; }
174 else { distfile = fastafile; splitmethod = "fasta"; }
176 taxFile = validParameter.validFile(parameters, "taxonomy", true);
177 if (taxFile == "not open") { abort = true; }
178 else if (taxFile == "not found") { taxFile = ""; }
180 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
181 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
183 if (columnfile != "") {
184 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
187 if (fastafile != "") {
188 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
189 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
192 //check for optional parameter and set defaults
193 // ...at some point should added some additional type checking...
194 //get user cutoff and precision or use defaults
196 temp = validParameter.validFile(parameters, "precision", false);
197 if (temp == "not found") { temp = "100"; }
198 //saves precision legnth for formatting below
199 length = temp.length();
200 convert(temp, precision);
202 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
203 hard = m->isTrue(temp);
205 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
206 large = m->isTrue(temp);
208 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
209 convert(temp, processors);
211 temp = validParameter.validFile(parameters, "splitmethod", false);
212 if (splitmethod != "fasta") {
213 if (temp == "not found") { splitmethod = "distance"; }
214 else { splitmethod = temp; }
217 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
218 convert(temp, cutoff);
219 cutoff += (5 / (precision * 10.0));
221 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
222 convert(temp, taxLevelCutoff);
224 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
226 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
227 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
229 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
230 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
232 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
234 showabund = validParameter.validFile(parameters, "showabund", false);
235 if (showabund == "not found") { showabund = "T"; }
237 timing = validParameter.validFile(parameters, "timing", false);
238 if (timing == "not found") { timing = "F"; }
242 catch(exception& e) {
243 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
248 //**********************************************************************************************************************
250 void ClusterSplitCommand::help(){
252 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
253 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
254 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
255 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
256 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
257 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
258 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
259 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
260 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
261 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
262 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
263 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
264 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
265 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
266 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
267 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
268 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
270 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
272 m->mothurOut("The cluster.split command should be in the following format: \n");
273 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
274 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
277 catch(exception& e) {
278 m->errorOut(e, "ClusterSplitCommand", "help");
283 //**********************************************************************************************************************
285 ClusterSplitCommand::~ClusterSplitCommand(){}
287 //**********************************************************************************************************************
289 int ClusterSplitCommand::execute(){
292 if (abort == true) { if (calledHelp) { return 0; } return 2; }
295 vector<string> listFileNames;
297 string singletonName = "";
298 double saveCutoff = cutoff;
300 //****************** file prep work ******************************//
305 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
306 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
308 if (pid == 0) { //only process 0 converts and splits
312 //if user gave a phylip file convert to column file
313 if (format == "phylip") {
315 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
317 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
319 NameAssignment* nameMap = NULL;
320 convert->setFormat("phylip");
321 convert->read(nameMap);
323 if (m->control_pressed) { delete convert; return 0; }
325 distfile = convert->getOutputFile();
327 //if no names file given with phylip file, create it
328 ListVector* listToMakeNameFile = convert->getListVector();
329 if (namefile == "") { //you need to make a namefile for split matrix
331 namefile = phylipfile + ".names";
332 m->openOutputFile(namefile, out);
333 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
334 string bin = listToMakeNameFile->get(i);
335 out << bin << '\t' << bin << endl;
339 delete listToMakeNameFile;
342 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
344 if (m->control_pressed) { return 0; }
347 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
349 //split matrix into non-overlapping groups
351 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
352 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
353 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
354 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
358 if (m->control_pressed) { delete split; return 0; }
360 singletonName = split->getSingletonNames();
361 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
364 //output a merged distance file
365 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
368 if (m->control_pressed) { return 0; }
370 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
373 //****************** break up files between processes and cluster each file set ******************************//
375 ////you are process 0 from above////
377 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
378 dividedNames.resize(processors);
380 //for each file group figure out which process will complete it
381 //want to divide the load intelligently so the big files are spread between processes
382 for (int i = 0; i < distName.size(); i++) {
383 int processToAssign = (i+1) % processors;
384 if (processToAssign == 0) { processToAssign = processors; }
386 dividedNames[(processToAssign-1)].push_back(distName[i]);
389 //not lets reverse the order of ever other process, so we balance big files running with little ones
390 for (int i = 0; i < processors; i++) {
391 int remainder = ((i+1) % processors);
392 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
396 //send each child the list of files it needs to process
397 for(int i = 1; i < processors; i++) {
398 //send number of file pairs
399 int num = dividedNames[i].size();
400 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
402 for (int j = 0; j < num; j++) { //send filenames to process i
403 char tempDistFileName[1024];
404 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
405 int lengthDist = (dividedNames[i][j].begin()->first).length();
407 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
408 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
410 char tempNameFileName[1024];
411 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
412 int lengthName = (dividedNames[i][j].begin()->second).length();
414 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
415 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
420 listFileNames = cluster(dividedNames[0], labels);
422 //receive the other processes info
423 for(int i = 1; i < processors; i++) {
424 int num = dividedNames[i].size();
427 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
428 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
430 //send list filenames to root process
431 for (int j = 0; j < num; j++) {
433 char tempListFileName[1024];
435 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
436 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
438 string myListFileName = tempListFileName;
439 myListFileName = myListFileName.substr(0, lengthList);
441 listFileNames.push_back(myListFileName);
444 //send Labels to root process
446 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
448 for (int j = 0; j < numLabels; j++) {
452 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
453 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
455 string myLabel = tempLabel;
456 myLabel = myLabel.substr(0, lengthLabel);
458 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
462 }else { //you are a child process
463 vector < map<string, string> > myNames;
465 //recieve the files you need to process
466 //receive number of file pairs
468 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
472 for (int j = 0; j < num; j++) { //receive filenames to process
474 char tempDistFileName[1024];
476 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
477 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
479 string myDistFileName = tempDistFileName;
480 myDistFileName = myDistFileName.substr(0, lengthDist);
483 char tempNameFileName[1024];
485 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
486 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
488 string myNameFileName = tempNameFileName;
489 myNameFileName = myNameFileName.substr(0, lengthName);
492 myNames[j][myDistFileName] = myNameFileName;
496 listFileNames = cluster(myNames, labels);
499 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
501 //send list filenames to root process
502 for (int j = 0; j < num; j++) {
503 char tempListFileName[1024];
504 strcpy(tempListFileName, listFileNames[j].c_str());
505 int lengthList = listFileNames[j].length();
507 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
508 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
511 //send Labels to root process
512 int numLabels = labels.size();
513 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
515 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
517 strcpy(tempLabel, (*it).c_str());
518 int lengthLabel = (*it).length();
520 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
521 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
526 MPI_Barrier(MPI_COMM_WORLD);
530 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
532 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
534 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
535 dividedNames.resize(processors);
537 //for each file group figure out which process will complete it
538 //want to divide the load intelligently so the big files are spread between processes
539 for (int i = 0; i < distName.size(); i++) {
540 int processToAssign = (i+1) % processors;
541 if (processToAssign == 0) { processToAssign = processors; }
543 dividedNames[(processToAssign-1)].push_back(distName[i]);
546 //not lets reverse the order of ever other process, so we balance big files running with little ones
547 for (int i = 0; i < processors; i++) {
548 int remainder = ((i+1) % processors);
549 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
552 createProcesses(dividedNames);
554 if (m->control_pressed) { return 0; }
556 //get list of list file names from each process
557 for(int i=0;i<processors;i++){
558 string filename = toString(processIDS[i]) + ".temp";
560 m->openInputFile(filename, in);
562 in >> tag; m->gobble(in);
566 in >> tempName; m->gobble(in);
567 listFileNames.push_back(tempName);
570 remove((toString(processIDS[i]) + ".temp").c_str());
573 filename = toString(processIDS[i]) + ".temp.labels";
575 m->openInputFile(filename, in2);
578 in2 >> tempCutoff; m->gobble(in2);
579 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
583 in2 >> tempName; m->gobble(in2);
584 if (labels.count(tempName) == 0) { labels.insert(tempName); }
587 remove((toString(processIDS[i]) + ".temp.labels").c_str());
591 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
594 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
596 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
598 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
600 //****************** merge list file and create rabund and sabund files ******************************//
602 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
605 if (pid == 0) { //only process 0 merges
608 ListVector* listSingle;
609 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
611 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
613 mergeLists(listFileNames, labelBins, listSingle);
615 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
617 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
619 //set list file as new current listfile
621 itTypes = outputTypes.find("list");
622 if (itTypes != outputTypes.end()) {
623 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
626 //set rabund file as new current rabundfile
627 itTypes = outputTypes.find("rabund");
628 if (itTypes != outputTypes.end()) {
629 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
632 //set sabund file as new current sabundfile
633 itTypes = outputTypes.find("sabund");
634 if (itTypes != outputTypes.end()) {
635 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
638 m->mothurOutEndLine();
639 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
640 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
641 m->mothurOutEndLine();
644 } //only process 0 merges
647 MPI_Barrier(MPI_COMM_WORLD);
652 catch(exception& e) {
653 m->errorOut(e, "ClusterSplitCommand", "execute");
657 //**********************************************************************************************************************
658 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
661 map<float, int> labelBin;
662 vector<float> orderFloat;
666 if (singleton != "none") {
668 m->openInputFile(singleton, in);
670 string firstCol, secondCol;
671 listSingle = new ListVector();
673 in >> firstCol >> secondCol; m->gobble(in);
674 listSingle->push_back(secondCol);
677 remove(singleton.c_str());
679 numSingleBins = listSingle->getNumBins();
680 }else{ listSingle = NULL; numSingleBins = 0; }
682 //go through users set and make them floats so we can sort them
683 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
686 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
687 else if (*it == "unique") { temp = -1.0; }
689 if (temp <= cutoff) {
690 orderFloat.push_back(temp);
691 labelBin[temp] = numSingleBins; //initialize numbins
696 sort(orderFloat.begin(), orderFloat.end());
699 //get the list info from each file
700 for (int k = 0; k < listNames.size(); k++) {
702 if (m->control_pressed) {
703 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
704 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
708 InputData* input = new InputData(listNames[k], "list");
709 ListVector* list = input->getListVector();
710 string lastLabel = list->getLabel();
712 string filledInList = listNames[k] + "filledInTemp";
714 m->openOutputFile(filledInList, outFilled);
716 //for each label needed
717 for(int l = 0; l < orderFloat.size(); l++){
720 if (orderFloat[l] == -1) { thisLabel = "unique"; }
721 else { thisLabel = toString(orderFloat[l], length-1); }
723 //this file has reached the end
725 list = input->getListVector(lastLabel, true);
726 }else{ //do you have the distance, or do you need to fill in
729 if (list->getLabel() == "unique") { labelFloat = -1.0; }
730 else { convert(list->getLabel(), labelFloat); }
732 //check for missing labels
733 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
734 //if its bigger get last label, otherwise keep it
736 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
738 lastLabel = list->getLabel();
742 list->setLabel(thisLabel);
743 list->print(outFilled);
746 labelBin[orderFloat[l]] += list->getNumBins();
750 list = input->getListVector();
753 if (list != NULL) { delete list; }
757 remove(listNames[k].c_str());
758 rename(filledInList.c_str(), listNames[k].c_str());
763 catch(exception& e) {
764 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
768 //**********************************************************************************************************************
769 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
771 if (outputDir == "") { outputDir += m->hasPath(distfile); }
772 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
774 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
775 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
776 m->openOutputFile(fileroot+ tag + ".list", outList);
778 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
779 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
780 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
782 map<float, int>::iterator itLabel;
784 //for each label needed
785 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
788 if (itLabel->first == -1) { thisLabel = "unique"; }
789 else { thisLabel = toString(itLabel->first, length-1); }
791 outList << thisLabel << '\t' << itLabel->second << '\t';
793 RAbundVector* rabund = new RAbundVector();
794 rabund->setLabel(thisLabel);
797 if (listSingle != NULL) {
798 for (int j = 0; j < listSingle->getNumBins(); j++) {
799 outList << listSingle->get(j) << '\t';
800 rabund->push_back(m->getNumNames(listSingle->get(j)));
804 //get the list info from each file
805 for (int k = 0; k < listNames.size(); k++) {
807 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
809 InputData* input = new InputData(listNames[k], "list");
810 ListVector* list = input->getListVector(thisLabel);
812 //this file has reached the end
813 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
815 for (int j = 0; j < list->getNumBins(); j++) {
816 outList << list->get(j) << '\t';
817 rabund->push_back(m->getNumNames(list->get(j)));
824 SAbundVector sabund = rabund->getSAbundVector();
826 sabund.print(outSabund);
827 rabund->print(outRabund);
837 if (listSingle != NULL) { delete listSingle; }
839 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
843 catch(exception& e) {
844 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
849 //**********************************************************************************************************************
851 void ClusterSplitCommand::printData(ListVector* oldList){
853 string label = oldList->getLabel();
854 RAbundVector oldRAbund = oldList->getRAbundVector();
856 oldRAbund.setLabel(label);
857 if (m->isTrue(showabund)) {
858 oldRAbund.getSAbundVector().print(cout);
860 oldRAbund.print(outRabund);
861 oldRAbund.getSAbundVector().print(outSabund);
863 oldList->print(outList);
865 catch(exception& e) {
866 m->errorOut(e, "ClusterSplitCommand", "printData");
870 //**********************************************************************************************************************
871 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
874 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
879 //loop through and create all the processes you want
880 while (process != processors) {
884 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
888 vector<string> listFileNames = cluster(dividedNames[process], labels);
890 //write out names to file
891 string filename = toString(getpid()) + ".temp";
893 m->openOutputFile(filename, out);
895 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
900 filename = toString(getpid()) + ".temp.labels";
901 m->openOutputFile(filename, outLabels);
903 outLabels << cutoff << endl;
904 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
905 outLabels << (*it) << endl;
911 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
912 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
917 //force parent to wait until all the processes are done
918 for (int i=0;i<processors;i++) {
919 int temp = processIDS[i];
927 catch(exception& e) {
928 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
932 //**********************************************************************************************************************
934 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
937 SparseMatrix* matrix;
940 RAbundVector* rabund;
942 vector<string> listFileNames;
944 double smallestCutoff = cutoff;
946 //cluster each distance file
947 for (int i = 0; i < distNames.size(); i++) {
948 if (m->control_pressed) { return listFileNames; }
950 string thisNamefile = distNames[i].begin()->second;
951 string thisDistFile = distNames[i].begin()->first;
953 //read in distance file
954 globaldata->setNameFile(thisNamefile);
955 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
959 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
961 //output your files too
963 cout << endl << "Reading " << thisDistFile << endl;
967 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
969 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
970 read->setCutoff(cutoff);
972 NameAssignment* nameMap = new NameAssignment(thisNamefile);
976 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
978 list = read->getListVector();
980 matrix = read->getMatrix();
987 //output your files too
989 cout << endl << "Clustering " << thisDistFile << endl;
993 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
995 rabund = new RAbundVector(list->getRAbundVector());
998 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
999 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
1000 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
1001 tag = cluster->getTag();
1003 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
1004 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
1007 m->openOutputFile(fileroot+ tag + ".list", listFile);
1009 listFileNames.push_back(fileroot+ tag + ".list");
1011 float previousDist = 0.00000;
1012 float rndPreviousDist = 0.00000;
1018 double saveCutoff = cutoff;
1020 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1022 if (m->control_pressed) { //clean up
1023 delete matrix; delete list; delete cluster; delete rabund;
1025 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1026 listFileNames.clear(); return listFileNames;
1029 cluster->update(saveCutoff);
1031 float dist = matrix->getSmallDist();
1034 rndDist = m->ceilDist(dist, precision);
1036 rndDist = m->roundDist(dist, precision);
1039 if(previousDist <= 0.0000 && dist != previousDist){
1040 oldList.setLabel("unique");
1041 oldList.print(listFile);
1042 if (labels.count("unique") == 0) { labels.insert("unique"); }
1044 else if(rndDist != rndPreviousDist){
1045 oldList.setLabel(toString(rndPreviousDist, length-1));
1046 oldList.print(listFile);
1047 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1050 previousDist = dist;
1051 rndPreviousDist = rndDist;
1056 if(previousDist <= 0.0000){
1057 oldList.setLabel("unique");
1058 oldList.print(listFile);
1059 if (labels.count("unique") == 0) { labels.insert("unique"); }
1061 else if(rndPreviousDist<cutoff){
1062 oldList.setLabel(toString(rndPreviousDist, length-1));
1063 oldList.print(listFile);
1064 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1067 delete matrix; delete list; delete cluster; delete rabund;
1070 if (m->control_pressed) { //clean up
1071 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1072 listFileNames.clear(); return listFileNames;
1075 remove(thisDistFile.c_str());
1076 remove(thisNamefile.c_str());
1078 if (saveCutoff != cutoff) {
1079 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1080 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1082 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1085 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1088 cutoff = smallestCutoff;
1090 return listFileNames;
1093 catch(exception& e) {
1094 m->errorOut(e, "ClusterSplitCommand", "cluster");
1100 //**********************************************************************************************************************
1102 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1107 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1112 string thisOutputDir = outputDir;
1113 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1114 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1115 remove(outputFileName.c_str());
1118 for (int i = 0; i < distNames.size(); i++) {
1119 if (m->control_pressed) { return 0; }
1121 string thisDistFile = distNames[i].begin()->first;
1123 m->appendFiles(thisDistFile, outputFileName);
1126 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1136 catch(exception& e) {
1137 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1141 //**********************************************************************************************************************