2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){
22 string AlignArray[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["list"] = tempOutNames;
37 outputTypes["rabund"] = tempOutNames;
38 outputTypes["sabund"] = tempOutNames;
39 outputTypes["column"] = tempOutNames;
42 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
46 //**********************************************************************************************************************
47 vector<string> ClusterSplitCommand::getRequiredParameters(){
49 string Array[] = {"fasta","phylip","column","or"};
50 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
58 //**********************************************************************************************************************
59 vector<string> ClusterSplitCommand::getRequiredFiles(){
61 vector<string> myArray;
65 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
69 //**********************************************************************************************************************
70 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
71 ClusterSplitCommand::ClusterSplitCommand(string option) {
73 globaldata = GlobalData::getInstance();
74 abort = false; calledHelp = false;
77 //allow user to run help
78 if(option == "help") { help(); abort = true; calledHelp = true; }
81 //valid paramters for this command
82 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
83 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
85 OptionParser parser(option);
86 map<string,string> parameters = parser.getParameters();
88 ValidParameters validParameter("cluster.split");
90 //check to make sure all parameters are valid for command
91 map<string,string>::iterator it;
92 for (it = parameters.begin(); it != parameters.end(); it++) {
93 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
98 //initialize outputTypes
99 vector<string> tempOutNames;
100 outputTypes["list"] = tempOutNames;
101 outputTypes["rabund"] = tempOutNames;
102 outputTypes["sabund"] = tempOutNames;
103 outputTypes["column"] = tempOutNames;
105 globaldata->newRead();
107 //if the user changes the output directory command factory will send this info to us in the output parameter
108 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("phylip");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["phylip"] = inputDir + it->second; }
123 it = parameters.find("column");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["column"] = inputDir + it->second; }
131 it = parameters.find("name");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["name"] = inputDir + it->second; }
139 it = parameters.find("taxonomy");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
147 it = parameters.find("fasta");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["fasta"] = inputDir + it->second; }
156 //check for required parameters
157 phylipfile = validParameter.validFile(parameters, "phylip", true);
158 if (phylipfile == "not open") { abort = true; }
159 else if (phylipfile == "not found") { phylipfile = ""; }
160 else { distfile = phylipfile; format = "phylip"; }
162 columnfile = validParameter.validFile(parameters, "column", true);
163 if (columnfile == "not open") { abort = true; }
164 else if (columnfile == "not found") { columnfile = ""; }
165 else { distfile = columnfile; format = "column"; }
167 namefile = validParameter.validFile(parameters, "name", true);
168 if (namefile == "not open") { abort = true; }
169 else if (namefile == "not found") { namefile = ""; }
171 fastafile = validParameter.validFile(parameters, "fasta", true);
172 if (fastafile == "not open") { abort = true; }
173 else if (fastafile == "not found") { fastafile = ""; }
174 else { distfile = fastafile; splitmethod = "fasta"; }
176 taxFile = validParameter.validFile(parameters, "taxonomy", true);
177 if (taxFile == "not open") { abort = true; }
178 else if (taxFile == "not found") { taxFile = ""; }
180 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
181 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
183 if (columnfile != "") {
184 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
187 if (fastafile != "") {
188 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
189 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
192 //check for optional parameter and set defaults
193 // ...at some point should added some additional type checking...
194 //get user cutoff and precision or use defaults
196 temp = validParameter.validFile(parameters, "precision", false);
197 if (temp == "not found") { temp = "100"; }
198 //saves precision legnth for formatting below
199 length = temp.length();
200 convert(temp, precision);
202 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
203 hard = m->isTrue(temp);
205 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
206 large = m->isTrue(temp);
208 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
209 convert(temp, processors);
211 temp = validParameter.validFile(parameters, "splitmethod", false);
212 if (splitmethod != "fasta") {
213 if (temp == "not found") { splitmethod = "distance"; }
214 else { splitmethod = temp; }
217 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
218 convert(temp, cutoff);
219 cutoff += (5 / (precision * 10.0));
221 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
222 convert(temp, taxLevelCutoff);
224 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
226 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
227 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
229 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
230 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
232 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
234 showabund = validParameter.validFile(parameters, "showabund", false);
235 if (showabund == "not found") { showabund = "T"; }
237 timing = validParameter.validFile(parameters, "timing", false);
238 if (timing == "not found") { timing = "F"; }
242 catch(exception& e) {
243 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
248 //**********************************************************************************************************************
250 void ClusterSplitCommand::help(){
252 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
253 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
254 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
255 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
256 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
257 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
258 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
259 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
260 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
261 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
262 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
263 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
264 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
265 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
266 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
267 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
268 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
270 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
272 m->mothurOut("The cluster.split command should be in the following format: \n");
273 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
274 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
277 catch(exception& e) {
278 m->errorOut(e, "ClusterSplitCommand", "help");
283 //**********************************************************************************************************************
285 ClusterSplitCommand::~ClusterSplitCommand(){}
287 //**********************************************************************************************************************
289 int ClusterSplitCommand::execute(){
292 if (abort == true) { if (calledHelp) { return 0; } return 2; }
295 vector<string> listFileNames;
297 string singletonName = "";
298 double saveCutoff = cutoff;
300 //****************** file prep work ******************************//
305 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
306 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
308 if (pid == 0) { //only process 0 converts and splits
312 //if user gave a phylip file convert to column file
313 if (format == "phylip") {
315 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
317 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
319 NameAssignment* nameMap = NULL;
320 convert->setFormat("phylip");
321 convert->read(nameMap);
323 if (m->control_pressed) { delete convert; return 0; }
325 distfile = convert->getOutputFile();
327 //if no names file given with phylip file, create it
328 ListVector* listToMakeNameFile = convert->getListVector();
329 if (namefile == "") { //you need to make a namefile for split matrix
331 namefile = phylipfile + ".names";
332 m->openOutputFile(namefile, out);
333 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
334 string bin = listToMakeNameFile->get(i);
335 out << bin << '\t' << bin << endl;
339 delete listToMakeNameFile;
342 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
344 if (m->control_pressed) { return 0; }
347 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
349 //split matrix into non-overlapping groups
351 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
352 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
353 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
354 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
358 if (m->control_pressed) { delete split; return 0; }
360 singletonName = split->getSingletonNames();
361 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
364 //output a merged distance file
365 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
368 if (m->control_pressed) { return 0; }
370 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
373 //****************** break up files between processes and cluster each file set ******************************//
375 ////you are process 0 from above////
377 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
378 dividedNames.resize(processors);
380 //for each file group figure out which process will complete it
381 //want to divide the load intelligently so the big files are spread between processes
382 for (int i = 0; i < distName.size(); i++) {
383 int processToAssign = (i+1) % processors;
384 if (processToAssign == 0) { processToAssign = processors; }
386 dividedNames[(processToAssign-1)].push_back(distName[i]);
389 //not lets reverse the order of ever other process, so we balance big files running with little ones
390 for (int i = 0; i < processors; i++) {
391 int remainder = ((i+1) % processors);
392 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
396 //send each child the list of files it needs to process
397 for(int i = 1; i < processors; i++) {
398 //send number of file pairs
399 int num = dividedNames[i].size();
400 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
402 for (int j = 0; j < num; j++) { //send filenames to process i
403 char tempDistFileName[1024];
404 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
405 int lengthDist = (dividedNames[i][j].begin()->first).length();
407 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
408 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
410 char tempNameFileName[1024];
411 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
412 int lengthName = (dividedNames[i][j].begin()->second).length();
414 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
415 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
420 listFileNames = cluster(dividedNames[0], labels);
422 //receive the other processes info
423 for(int i = 1; i < processors; i++) {
424 int num = dividedNames[i].size();
427 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
428 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
430 //send list filenames to root process
431 for (int j = 0; j < num; j++) {
433 char tempListFileName[1024];
435 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
436 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
438 string myListFileName = tempListFileName;
439 myListFileName = myListFileName.substr(0, lengthList);
441 listFileNames.push_back(myListFileName);
444 //send Labels to root process
446 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
448 for (int j = 0; j < numLabels; j++) {
452 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
453 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
455 string myLabel = tempLabel;
456 myLabel = myLabel.substr(0, lengthLabel);
458 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
462 }else { //you are a child process
463 vector < map<string, string> > myNames;
465 //recieve the files you need to process
466 //receive number of file pairs
468 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
472 for (int j = 0; j < num; j++) { //receive filenames to process
474 char tempDistFileName[1024];
476 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
477 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
479 string myDistFileName = tempDistFileName;
480 myDistFileName = myDistFileName.substr(0, lengthDist);
483 char tempNameFileName[1024];
485 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
486 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
488 string myNameFileName = tempNameFileName;
489 myNameFileName = myNameFileName.substr(0, lengthName);
492 myNames[j][myDistFileName] = myNameFileName;
496 listFileNames = cluster(myNames, labels);
499 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
501 //send list filenames to root process
502 for (int j = 0; j < num; j++) {
503 char tempListFileName[1024];
504 strcpy(tempListFileName, listFileNames[j].c_str());
505 int lengthList = listFileNames[j].length();
507 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
508 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
511 //send Labels to root process
512 int numLabels = labels.size();
513 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
515 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
517 strcpy(tempLabel, (*it).c_str());
518 int lengthLabel = (*it).length();
520 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
521 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
526 MPI_Barrier(MPI_COMM_WORLD);
530 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
532 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
534 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
535 dividedNames.resize(processors);
537 //for each file group figure out which process will complete it
538 //want to divide the load intelligently so the big files are spread between processes
539 for (int i = 0; i < distName.size(); i++) {
540 int processToAssign = (i+1) % processors;
541 if (processToAssign == 0) { processToAssign = processors; }
543 dividedNames[(processToAssign-1)].push_back(distName[i]);
546 //not lets reverse the order of ever other process, so we balance big files running with little ones
547 for (int i = 0; i < processors; i++) {
548 int remainder = ((i+1) % processors);
549 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
552 createProcesses(dividedNames);
554 if (m->control_pressed) { return 0; }
556 //get list of list file names from each process
557 for(int i=0;i<processors;i++){
558 string filename = toString(processIDS[i]) + ".temp";
560 m->openInputFile(filename, in);
562 in >> tag; m->gobble(in);
566 in >> tempName; m->gobble(in);
567 listFileNames.push_back(tempName);
570 remove((toString(processIDS[i]) + ".temp").c_str());
573 filename = toString(processIDS[i]) + ".temp.labels";
575 m->openInputFile(filename, in2);
578 in2 >> tempCutoff; m->gobble(in2);
579 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
583 in2 >> tempName; m->gobble(in2);
584 if (labels.count(tempName) == 0) { labels.insert(tempName); }
587 remove((toString(processIDS[i]) + ".temp.labels").c_str());
591 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
594 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
596 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
598 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
600 //****************** merge list file and create rabund and sabund files ******************************//
602 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
605 if (pid == 0) { //only process 0 merges
608 ListVector* listSingle;
609 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
611 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
613 mergeLists(listFileNames, labelBins, listSingle);
615 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
617 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
619 m->mothurOutEndLine();
620 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
621 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
622 m->mothurOutEndLine();
625 } //only process 0 merges
628 MPI_Barrier(MPI_COMM_WORLD);
633 catch(exception& e) {
634 m->errorOut(e, "ClusterSplitCommand", "execute");
638 //**********************************************************************************************************************
639 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
642 map<float, int> labelBin;
643 vector<float> orderFloat;
647 if (singleton != "none") {
649 m->openInputFile(singleton, in);
651 string firstCol, secondCol;
652 listSingle = new ListVector();
654 in >> firstCol >> secondCol; m->gobble(in);
655 listSingle->push_back(secondCol);
658 remove(singleton.c_str());
660 numSingleBins = listSingle->getNumBins();
661 }else{ listSingle = NULL; numSingleBins = 0; }
663 //go through users set and make them floats so we can sort them
664 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
667 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
668 else if (*it == "unique") { temp = -1.0; }
670 if (temp <= cutoff) {
671 orderFloat.push_back(temp);
672 labelBin[temp] = numSingleBins; //initialize numbins
677 sort(orderFloat.begin(), orderFloat.end());
680 //get the list info from each file
681 for (int k = 0; k < listNames.size(); k++) {
683 if (m->control_pressed) {
684 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
685 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
689 InputData* input = new InputData(listNames[k], "list");
690 ListVector* list = input->getListVector();
691 string lastLabel = list->getLabel();
693 string filledInList = listNames[k] + "filledInTemp";
695 m->openOutputFile(filledInList, outFilled);
697 //for each label needed
698 for(int l = 0; l < orderFloat.size(); l++){
701 if (orderFloat[l] == -1) { thisLabel = "unique"; }
702 else { thisLabel = toString(orderFloat[l], length-1); }
704 //this file has reached the end
706 list = input->getListVector(lastLabel, true);
707 }else{ //do you have the distance, or do you need to fill in
710 if (list->getLabel() == "unique") { labelFloat = -1.0; }
711 else { convert(list->getLabel(), labelFloat); }
713 //check for missing labels
714 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
715 //if its bigger get last label, otherwise keep it
717 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
719 lastLabel = list->getLabel();
723 list->setLabel(thisLabel);
724 list->print(outFilled);
727 labelBin[orderFloat[l]] += list->getNumBins();
731 list = input->getListVector();
734 if (list != NULL) { delete list; }
738 remove(listNames[k].c_str());
739 rename(filledInList.c_str(), listNames[k].c_str());
744 catch(exception& e) {
745 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
749 //**********************************************************************************************************************
750 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
752 if (outputDir == "") { outputDir += m->hasPath(distfile); }
753 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
755 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
756 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
757 m->openOutputFile(fileroot+ tag + ".list", outList);
759 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
760 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
761 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
763 map<float, int>::iterator itLabel;
765 //for each label needed
766 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
769 if (itLabel->first == -1) { thisLabel = "unique"; }
770 else { thisLabel = toString(itLabel->first, length-1); }
772 outList << thisLabel << '\t' << itLabel->second << '\t';
774 RAbundVector* rabund = new RAbundVector();
775 rabund->setLabel(thisLabel);
778 if (listSingle != NULL) {
779 for (int j = 0; j < listSingle->getNumBins(); j++) {
780 outList << listSingle->get(j) << '\t';
781 rabund->push_back(m->getNumNames(listSingle->get(j)));
785 //get the list info from each file
786 for (int k = 0; k < listNames.size(); k++) {
788 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
790 InputData* input = new InputData(listNames[k], "list");
791 ListVector* list = input->getListVector(thisLabel);
793 //this file has reached the end
794 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
796 for (int j = 0; j < list->getNumBins(); j++) {
797 outList << list->get(j) << '\t';
798 rabund->push_back(m->getNumNames(list->get(j)));
805 SAbundVector sabund = rabund->getSAbundVector();
807 sabund.print(outSabund);
808 rabund->print(outRabund);
818 if (listSingle != NULL) { delete listSingle; }
820 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
824 catch(exception& e) {
825 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
830 //**********************************************************************************************************************
832 void ClusterSplitCommand::printData(ListVector* oldList){
834 string label = oldList->getLabel();
835 RAbundVector oldRAbund = oldList->getRAbundVector();
837 oldRAbund.setLabel(label);
838 if (m->isTrue(showabund)) {
839 oldRAbund.getSAbundVector().print(cout);
841 oldRAbund.print(outRabund);
842 oldRAbund.getSAbundVector().print(outSabund);
844 oldList->print(outList);
846 catch(exception& e) {
847 m->errorOut(e, "ClusterSplitCommand", "printData");
851 //**********************************************************************************************************************
852 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
855 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
860 //loop through and create all the processes you want
861 while (process != processors) {
865 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
869 vector<string> listFileNames = cluster(dividedNames[process], labels);
871 //write out names to file
872 string filename = toString(getpid()) + ".temp";
874 m->openOutputFile(filename, out);
876 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
881 filename = toString(getpid()) + ".temp.labels";
882 m->openOutputFile(filename, outLabels);
884 outLabels << cutoff << endl;
885 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
886 outLabels << (*it) << endl;
892 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
893 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
898 //force parent to wait until all the processes are done
899 for (int i=0;i<processors;i++) {
900 int temp = processIDS[i];
908 catch(exception& e) {
909 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
913 //**********************************************************************************************************************
915 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
918 SparseMatrix* matrix;
921 RAbundVector* rabund;
923 vector<string> listFileNames;
925 double smallestCutoff = cutoff;
927 //cluster each distance file
928 for (int i = 0; i < distNames.size(); i++) {
929 if (m->control_pressed) { return listFileNames; }
931 string thisNamefile = distNames[i].begin()->second;
932 string thisDistFile = distNames[i].begin()->first;
934 //read in distance file
935 globaldata->setNameFile(thisNamefile);
936 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
940 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
942 //output your files too
944 cout << endl << "Reading " << thisDistFile << endl;
948 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
950 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
951 read->setCutoff(cutoff);
953 NameAssignment* nameMap = new NameAssignment(thisNamefile);
957 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
959 list = read->getListVector();
961 matrix = read->getMatrix();
968 //output your files too
970 cout << endl << "Clustering " << thisDistFile << endl;
974 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
976 rabund = new RAbundVector(list->getRAbundVector());
979 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
980 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
981 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
982 tag = cluster->getTag();
984 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
985 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
988 m->openOutputFile(fileroot+ tag + ".list", listFile);
990 listFileNames.push_back(fileroot+ tag + ".list");
992 float previousDist = 0.00000;
993 float rndPreviousDist = 0.00000;
999 double saveCutoff = cutoff;
1001 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1003 if (m->control_pressed) { //clean up
1004 delete matrix; delete list; delete cluster; delete rabund;
1006 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1007 listFileNames.clear(); return listFileNames;
1010 cluster->update(saveCutoff);
1012 float dist = matrix->getSmallDist();
1015 rndDist = m->ceilDist(dist, precision);
1017 rndDist = m->roundDist(dist, precision);
1020 if(previousDist <= 0.0000 && dist != previousDist){
1021 oldList.setLabel("unique");
1022 oldList.print(listFile);
1023 if (labels.count("unique") == 0) { labels.insert("unique"); }
1025 else if(rndDist != rndPreviousDist){
1026 oldList.setLabel(toString(rndPreviousDist, length-1));
1027 oldList.print(listFile);
1028 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1031 previousDist = dist;
1032 rndPreviousDist = rndDist;
1037 if(previousDist <= 0.0000){
1038 oldList.setLabel("unique");
1039 oldList.print(listFile);
1040 if (labels.count("unique") == 0) { labels.insert("unique"); }
1042 else if(rndPreviousDist<cutoff){
1043 oldList.setLabel(toString(rndPreviousDist, length-1));
1044 oldList.print(listFile);
1045 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1048 delete matrix; delete list; delete cluster; delete rabund;
1051 if (m->control_pressed) { //clean up
1052 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1053 listFileNames.clear(); return listFileNames;
1056 remove(thisDistFile.c_str());
1057 remove(thisNamefile.c_str());
1059 if (saveCutoff != cutoff) {
1060 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1061 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1063 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1066 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1069 cutoff = smallestCutoff;
1071 return listFileNames;
1074 catch(exception& e) {
1075 m->errorOut(e, "ClusterSplitCommand", "cluster");
1081 //**********************************************************************************************************************
1083 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1088 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1093 string thisOutputDir = outputDir;
1094 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1095 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1096 remove(outputFileName.c_str());
1099 for (int i = 0; i < distNames.size(); i++) {
1100 if (m->control_pressed) { return 0; }
1102 string thisDistFile = distNames[i].begin()->first;
1104 m->appendFiles(thisDistFile, outputFileName);
1107 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1117 catch(exception& e) {
1118 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1122 //**********************************************************************************************************************