2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){
22 string AlignArray[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){
35 //initialize outputTypes
36 vector<string> tempOutNames;
37 outputTypes["list"] = tempOutNames;
38 outputTypes["rabund"] = tempOutNames;
39 outputTypes["sabund"] = tempOutNames;
40 outputTypes["column"] = tempOutNames;
43 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
47 //**********************************************************************************************************************
48 vector<string> ClusterSplitCommand::getRequiredParameters(){
50 string Array[] = {"fasta","phylip","column","or"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
59 //**********************************************************************************************************************
60 vector<string> ClusterSplitCommand::getRequiredFiles(){
62 vector<string> myArray;
66 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
70 //**********************************************************************************************************************
71 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
72 ClusterSplitCommand::ClusterSplitCommand(string option) {
74 globaldata = GlobalData::getInstance();
78 //allow user to run help
79 if(option == "help") { help(); abort = true; }
82 //valid paramters for this command
83 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
84 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
86 OptionParser parser(option);
87 map<string,string> parameters = parser.getParameters();
89 ValidParameters validParameter("cluster.split");
91 //check to make sure all parameters are valid for command
92 map<string,string>::iterator it;
93 for (it = parameters.begin(); it != parameters.end(); it++) {
94 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["list"] = tempOutNames;
102 outputTypes["rabund"] = tempOutNames;
103 outputTypes["sabund"] = tempOutNames;
104 outputTypes["column"] = tempOutNames;
106 globaldata->newRead();
108 //if the user changes the output directory command factory will send this info to us in the output parameter
109 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
111 //if the user changes the input directory command factory will send this info to us in the output parameter
112 string inputDir = validParameter.validFile(parameters, "inputdir", false);
113 if (inputDir == "not found"){ inputDir = ""; }
116 it = parameters.find("phylip");
117 //user has given a template file
118 if(it != parameters.end()){
119 path = m->hasPath(it->second);
120 //if the user has not given a path then, add inputdir. else leave path alone.
121 if (path == "") { parameters["phylip"] = inputDir + it->second; }
124 it = parameters.find("column");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["column"] = inputDir + it->second; }
132 it = parameters.find("name");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["name"] = inputDir + it->second; }
140 it = parameters.find("taxonomy");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
148 it = parameters.find("fasta");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["fasta"] = inputDir + it->second; }
157 //check for required parameters
158 phylipfile = validParameter.validFile(parameters, "phylip", true);
159 if (phylipfile == "not open") { abort = true; }
160 else if (phylipfile == "not found") { phylipfile = ""; }
161 else { distfile = phylipfile; format = "phylip"; }
163 columnfile = validParameter.validFile(parameters, "column", true);
164 if (columnfile == "not open") { abort = true; }
165 else if (columnfile == "not found") { columnfile = ""; }
166 else { distfile = columnfile; format = "column"; }
168 namefile = validParameter.validFile(parameters, "name", true);
169 if (namefile == "not open") { abort = true; }
170 else if (namefile == "not found") { namefile = ""; }
172 fastafile = validParameter.validFile(parameters, "fasta", true);
173 if (fastafile == "not open") { abort = true; }
174 else if (fastafile == "not found") { fastafile = ""; }
175 else { distfile = fastafile; splitmethod = "fasta"; }
177 taxFile = validParameter.validFile(parameters, "taxonomy", true);
178 if (taxFile == "not open") { abort = true; }
179 else if (taxFile == "not found") { taxFile = ""; }
181 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
182 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
184 if (columnfile != "") {
185 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
188 if (fastafile != "") {
189 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
193 //check for optional parameter and set defaults
194 // ...at some point should added some additional type checking...
195 //get user cutoff and precision or use defaults
197 temp = validParameter.validFile(parameters, "precision", false);
198 if (temp == "not found") { temp = "100"; }
199 //saves precision legnth for formatting below
200 length = temp.length();
201 convert(temp, precision);
203 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
204 hard = m->isTrue(temp);
206 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
207 large = m->isTrue(temp);
209 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
210 convert(temp, processors);
212 temp = validParameter.validFile(parameters, "splitmethod", false);
213 if (splitmethod != "fasta") {
214 if (temp == "not found") { splitmethod = "distance"; }
215 else { splitmethod = temp; }
218 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
219 convert(temp, cutoff);
220 cutoff += (5 / (precision * 10.0));
222 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
223 convert(temp, taxLevelCutoff);
225 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
227 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
228 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
230 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
231 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
233 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
235 showabund = validParameter.validFile(parameters, "showabund", false);
236 if (showabund == "not found") { showabund = "T"; }
238 timing = validParameter.validFile(parameters, "timing", false);
239 if (timing == "not found") { timing = "F"; }
243 catch(exception& e) {
244 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
249 //**********************************************************************************************************************
251 void ClusterSplitCommand::help(){
253 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
254 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
255 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
256 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
257 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
258 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
259 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
260 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
261 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
262 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
263 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
264 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
265 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
266 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
267 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
268 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
269 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
271 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
273 m->mothurOut("The cluster.split command should be in the following format: \n");
274 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
275 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
278 catch(exception& e) {
279 m->errorOut(e, "ClusterSplitCommand", "help");
284 //**********************************************************************************************************************
286 ClusterSplitCommand::~ClusterSplitCommand(){}
288 //**********************************************************************************************************************
290 int ClusterSplitCommand::execute(){
293 if (abort == true) { return 0; }
296 vector<string> listFileNames;
298 string singletonName = "";
299 double saveCutoff = cutoff;
301 //****************** file prep work ******************************//
306 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
307 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
309 if (pid == 0) { //only process 0 converts and splits
313 //if user gave a phylip file convert to column file
314 if (format == "phylip") {
316 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
318 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
320 NameAssignment* nameMap = NULL;
321 convert->setFormat("phylip");
322 convert->read(nameMap);
324 if (m->control_pressed) { delete convert; return 0; }
326 distfile = convert->getOutputFile();
328 //if no names file given with phylip file, create it
329 ListVector* listToMakeNameFile = convert->getListVector();
330 if (namefile == "") { //you need to make a namefile for split matrix
332 namefile = phylipfile + ".names";
333 m->openOutputFile(namefile, out);
334 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
335 string bin = listToMakeNameFile->get(i);
336 out << bin << '\t' << bin << endl;
340 delete listToMakeNameFile;
343 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
345 if (m->control_pressed) { return 0; }
348 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
350 //split matrix into non-overlapping groups
352 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
353 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
354 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
355 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
359 if (m->control_pressed) { delete split; return 0; }
361 singletonName = split->getSingletonNames();
362 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
365 //output a merged distance file
366 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
369 if (m->control_pressed) { return 0; }
371 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
374 //****************** break up files between processes and cluster each file set ******************************//
376 ////you are process 0 from above////
378 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
379 dividedNames.resize(processors);
381 //for each file group figure out which process will complete it
382 //want to divide the load intelligently so the big files are spread between processes
384 for (int i = 0; i < distName.size(); i++) {
385 int processToAssign = (i+1) % processors;
386 if (processToAssign == 0) { processToAssign = processors; }
388 dividedNames[(processToAssign-1)].push_back(distName[i]);
391 //not lets reverse the order of ever other process, so we balance big files running with little ones
392 for (int i = 0; i < processors; i++) {
393 int remainder = ((i+1) % processors);
394 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
398 //send each child the list of files it needs to process
399 for(int i = 1; i < processors; i++) {
400 //send number of file pairs
401 int num = dividedNames[i].size();
402 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
404 for (int j = 0; j < num; j++) { //send filenames to process i
405 char tempDistFileName[1024];
406 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
407 int lengthDist = (dividedNames[i][j].begin()->first).length();
409 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
410 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
412 char tempNameFileName[1024];
413 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
414 int lengthName = (dividedNames[i][j].begin()->second).length();
416 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
417 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
422 listFileNames = cluster(dividedNames[0], labels);
424 //receive the other processes info
425 for(int i = 1; i < processors; i++) {
426 int num = dividedNames[i].size();
429 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
430 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
432 //send list filenames to root process
433 for (int j = 0; j < num; j++) {
435 char tempListFileName[1024];
437 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
438 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
440 string myListFileName = tempListFileName;
441 myListFileName = myListFileName.substr(0, lengthList);
443 listFileNames.push_back(myListFileName);
446 //send Labels to root process
448 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
450 for (int j = 0; j < numLabels; j++) {
454 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
455 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
457 string myLabel = tempLabel;
458 myLabel = myLabel.substr(0, lengthLabel);
460 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
464 }else { //you are a child process
465 vector < map<string, string> > myNames;
467 //recieve the files you need to process
468 //receive number of file pairs
470 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
474 for (int j = 0; j < num; j++) { //receive filenames to process
476 char tempDistFileName[1024];
478 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
479 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
481 string myDistFileName = tempDistFileName;
482 myDistFileName = myDistFileName.substr(0, lengthDist);
485 char tempNameFileName[1024];
487 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
488 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
490 string myNameFileName = tempNameFileName;
491 myNameFileName = myNameFileName.substr(0, lengthName);
494 myNames[j][myDistFileName] = myNameFileName;
498 listFileNames = cluster(myNames, labels);
501 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
503 //send list filenames to root process
504 for (int j = 0; j < num; j++) {
505 char tempListFileName[1024];
506 strcpy(tempListFileName, listFileNames[j].c_str());
507 int lengthList = listFileNames[j].length();
509 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
510 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
513 //send Labels to root process
514 int numLabels = labels.size();
515 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
517 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
519 strcpy(tempLabel, (*it).c_str());
520 int lengthLabel = (*it).length();
522 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
523 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
528 MPI_Barrier(MPI_COMM_WORLD);
532 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
534 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
536 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
537 dividedNames.resize(processors);
539 //for each file group figure out which process will complete it
540 //want to divide the load intelligently so the big files are spread between processes
542 for (int i = 0; i < distName.size(); i++) {
543 int processToAssign = (i+1) % processors;
544 if (processToAssign == 0) { processToAssign = processors; }
546 dividedNames[(processToAssign-1)].push_back(distName[i]);
549 //not lets reverse the order of ever other process, so we balance big files running with little ones
550 for (int i = 0; i < processors; i++) {
551 int remainder = ((i+1) % processors);
552 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
555 createProcesses(dividedNames);
557 if (m->control_pressed) { return 0; }
559 //get list of list file names from each process
560 for(int i=0;i<processors;i++){
561 string filename = toString(processIDS[i]) + ".temp";
563 m->openInputFile(filename, in);
565 in >> tag; m->gobble(in);
569 in >> tempName; m->gobble(in);
570 listFileNames.push_back(tempName);
573 remove((toString(processIDS[i]) + ".temp").c_str());
576 filename = toString(processIDS[i]) + ".temp.labels";
578 m->openInputFile(filename, in2);
581 in2 >> tempCutoff; m->gobble(in2);
582 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
586 in2 >> tempName; m->gobble(in2);
587 if (labels.count(tempName) == 0) { labels.insert(tempName); }
590 remove((toString(processIDS[i]) + ".temp.labels").c_str());
594 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
597 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
599 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
601 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
603 //****************** merge list file and create rabund and sabund files ******************************//
605 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
608 if (pid == 0) { //only process 0 merges
611 ListVector* listSingle;
612 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
614 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
616 mergeLists(listFileNames, labelBins, listSingle);
618 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
620 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
622 m->mothurOutEndLine();
623 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
624 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
625 m->mothurOutEndLine();
628 } //only process 0 merges
631 MPI_Barrier(MPI_COMM_WORLD);
636 catch(exception& e) {
637 m->errorOut(e, "ClusterSplitCommand", "execute");
641 //**********************************************************************************************************************
642 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
645 map<float, int> labelBin;
646 vector<float> orderFloat;
650 if (singleton != "none") {
652 m->openInputFile(singleton, in);
654 string firstCol, secondCol;
655 listSingle = new ListVector();
657 in >> firstCol >> secondCol; m->gobble(in);
658 listSingle->push_back(secondCol);
661 remove(singleton.c_str());
663 numSingleBins = listSingle->getNumBins();
664 }else{ listSingle = NULL; numSingleBins = 0; }
666 //go through users set and make them floats so we can sort them
667 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
670 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
671 else if (*it == "unique") { temp = -1.0; }
673 if (temp <= cutoff) {
674 orderFloat.push_back(temp);
675 labelBin[temp] = numSingleBins; //initialize numbins
680 sort(orderFloat.begin(), orderFloat.end());
683 //get the list info from each file
684 for (int k = 0; k < listNames.size(); k++) {
686 if (m->control_pressed) {
687 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
688 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
692 InputData* input = new InputData(listNames[k], "list");
693 ListVector* list = input->getListVector();
694 string lastLabel = list->getLabel();
696 string filledInList = listNames[k] + "filledInTemp";
698 m->openOutputFile(filledInList, outFilled);
700 //for each label needed
701 for(int l = 0; l < orderFloat.size(); l++){
704 if (orderFloat[l] == -1) { thisLabel = "unique"; }
705 else { thisLabel = toString(orderFloat[l], length-1); }
707 //this file has reached the end
709 list = input->getListVector(lastLabel, true);
710 }else{ //do you have the distance, or do you need to fill in
713 if (list->getLabel() == "unique") { labelFloat = -1.0; }
714 else { convert(list->getLabel(), labelFloat); }
716 //check for missing labels
717 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
718 //if its bigger get last label, otherwise keep it
720 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
722 lastLabel = list->getLabel();
726 list->setLabel(thisLabel);
727 list->print(outFilled);
730 labelBin[orderFloat[l]] += list->getNumBins();
734 list = input->getListVector();
737 if (list != NULL) { delete list; }
741 remove(listNames[k].c_str());
742 rename(filledInList.c_str(), listNames[k].c_str());
747 catch(exception& e) {
748 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
752 //**********************************************************************************************************************
753 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
755 if (outputDir == "") { outputDir += m->hasPath(distfile); }
756 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
758 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
759 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
760 m->openOutputFile(fileroot+ tag + ".list", outList);
762 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
763 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
764 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
766 map<float, int>::iterator itLabel;
768 //for each label needed
769 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
772 if (itLabel->first == -1) { thisLabel = "unique"; }
773 else { thisLabel = toString(itLabel->first, length-1); }
775 outList << thisLabel << '\t' << itLabel->second << '\t';
777 RAbundVector* rabund = new RAbundVector();
778 rabund->setLabel(thisLabel);
781 if (listSingle != NULL) {
782 for (int j = 0; j < listSingle->getNumBins(); j++) {
783 outList << listSingle->get(j) << '\t';
784 rabund->push_back(m->getNumNames(listSingle->get(j)));
788 //get the list info from each file
789 for (int k = 0; k < listNames.size(); k++) {
791 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
793 InputData* input = new InputData(listNames[k], "list");
794 ListVector* list = input->getListVector(thisLabel);
796 //this file has reached the end
797 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
799 for (int j = 0; j < list->getNumBins(); j++) {
800 outList << list->get(j) << '\t';
801 rabund->push_back(m->getNumNames(list->get(j)));
808 SAbundVector sabund = rabund->getSAbundVector();
810 sabund.print(outSabund);
811 rabund->print(outRabund);
821 if (listSingle != NULL) { delete listSingle; }
823 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
827 catch(exception& e) {
828 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
833 //**********************************************************************************************************************
835 void ClusterSplitCommand::printData(ListVector* oldList){
837 string label = oldList->getLabel();
838 RAbundVector oldRAbund = oldList->getRAbundVector();
840 oldRAbund.setLabel(label);
841 if (m->isTrue(showabund)) {
842 oldRAbund.getSAbundVector().print(cout);
844 oldRAbund.print(outRabund);
845 oldRAbund.getSAbundVector().print(outSabund);
847 oldList->print(outList);
849 catch(exception& e) {
850 m->errorOut(e, "ClusterSplitCommand", "printData");
854 //**********************************************************************************************************************
855 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
858 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
863 //loop through and create all the processes you want
864 while (process != processors) {
868 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
872 vector<string> listFileNames = cluster(dividedNames[process], labels);
874 //write out names to file
875 string filename = toString(getpid()) + ".temp";
877 m->openOutputFile(filename, out);
879 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
884 filename = toString(getpid()) + ".temp.labels";
885 m->openOutputFile(filename, outLabels);
887 outLabels << cutoff << endl;
888 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
889 outLabels << (*it) << endl;
895 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
896 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
901 //force parent to wait until all the processes are done
902 for (int i=0;i<processors;i++) {
903 int temp = processIDS[i];
911 catch(exception& e) {
912 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
916 //**********************************************************************************************************************
918 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
921 SparseMatrix* matrix;
924 RAbundVector* rabund;
926 vector<string> listFileNames;
928 double smallestCutoff = cutoff;
930 //cluster each distance file
931 for (int i = 0; i < distNames.size(); i++) {
932 if (m->control_pressed) { return listFileNames; }
934 string thisNamefile = distNames[i].begin()->second;
935 string thisDistFile = distNames[i].begin()->first;
937 //read in distance file
938 globaldata->setNameFile(thisNamefile);
939 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
943 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
945 //output your files too
947 cout << endl << "Reading " << thisDistFile << endl;
951 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
953 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
954 read->setCutoff(cutoff);
956 NameAssignment* nameMap = new NameAssignment(thisNamefile);
960 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
962 list = read->getListVector();
964 matrix = read->getMatrix();
971 //output your files too
973 cout << endl << "Clustering " << thisDistFile << endl;
977 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
979 rabund = new RAbundVector(list->getRAbundVector());
982 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
983 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
984 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
985 tag = cluster->getTag();
987 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
988 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
991 m->openOutputFile(fileroot+ tag + ".list", listFile);
993 listFileNames.push_back(fileroot+ tag + ".list");
995 time_t estart = time(NULL);
997 float previousDist = 0.00000;
998 float rndPreviousDist = 0.00000;
1004 double saveCutoff = cutoff;
1006 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1008 if (m->control_pressed) { //clean up
1009 delete matrix; delete list; delete cluster; delete rabund;
1011 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1012 listFileNames.clear(); return listFileNames;
1015 cluster->update(saveCutoff);
1017 float dist = matrix->getSmallDist();
1020 rndDist = m->ceilDist(dist, precision);
1022 rndDist = m->roundDist(dist, precision);
1025 if(previousDist <= 0.0000 && dist != previousDist){
1026 oldList.setLabel("unique");
1027 oldList.print(listFile);
1028 if (labels.count("unique") == 0) { labels.insert("unique"); }
1030 else if(rndDist != rndPreviousDist){
1031 oldList.setLabel(toString(rndPreviousDist, length-1));
1032 oldList.print(listFile);
1033 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1036 previousDist = dist;
1037 rndPreviousDist = rndDist;
1042 if(previousDist <= 0.0000){
1043 oldList.setLabel("unique");
1044 oldList.print(listFile);
1045 if (labels.count("unique") == 0) { labels.insert("unique"); }
1047 else if(rndPreviousDist<cutoff){
1048 oldList.setLabel(toString(rndPreviousDist, length-1));
1049 oldList.print(listFile);
1050 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1053 delete matrix; delete list; delete cluster; delete rabund;
1056 if (m->control_pressed) { //clean up
1057 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1058 listFileNames.clear(); return listFileNames;
1061 remove(thisDistFile.c_str());
1062 remove(thisNamefile.c_str());
1064 if (saveCutoff != cutoff) {
1065 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1066 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1068 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1071 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1074 cutoff = smallestCutoff;
1076 return listFileNames;
1079 catch(exception& e) {
1080 m->errorOut(e, "ClusterSplitCommand", "cluster");
1086 //**********************************************************************************************************************
1088 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1093 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1098 string thisOutputDir = outputDir;
1099 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1100 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1101 remove(outputFileName.c_str());
1104 for (int i = 0; i < distNames.size(); i++) {
1105 if (m->control_pressed) { return 0; }
1107 string thisDistFile = distNames[i].begin()->first;
1109 m->appendFiles(thisDistFile, outputFileName);
1112 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1122 catch(exception& e) {
1123 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1127 //**********************************************************************************************************************