2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){
22 string AlignArray[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){
35 //initialize outputTypes
36 vector<string> tempOutNames;
37 outputTypes["list"] = tempOutNames;
38 outputTypes["rabund"] = tempOutNames;
39 outputTypes["sabund"] = tempOutNames;
40 outputTypes["column"] = tempOutNames;
43 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
47 //**********************************************************************************************************************
48 vector<string> ClusterSplitCommand::getRequiredParameters(){
50 string Array[] = {"fasta","phylip","column","or"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
59 //**********************************************************************************************************************
60 vector<string> ClusterSplitCommand::getRequiredFiles(){
62 vector<string> myArray;
66 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
70 //**********************************************************************************************************************
71 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
72 ClusterSplitCommand::ClusterSplitCommand(string option) {
74 globaldata = GlobalData::getInstance();
78 //allow user to run help
79 if(option == "help") { help(); abort = true; }
82 //valid paramters for this command
83 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
84 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
86 OptionParser parser(option);
87 map<string,string> parameters = parser.getParameters();
89 ValidParameters validParameter("cluster.split");
91 //check to make sure all parameters are valid for command
92 map<string,string>::iterator it;
93 for (it = parameters.begin(); it != parameters.end(); it++) {
94 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["list"] = tempOutNames;
102 outputTypes["rabund"] = tempOutNames;
103 outputTypes["sabund"] = tempOutNames;
104 outputTypes["column"] = tempOutNames;
106 globaldata->newRead();
108 //if the user changes the output directory command factory will send this info to us in the output parameter
109 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
111 //if the user changes the input directory command factory will send this info to us in the output parameter
112 string inputDir = validParameter.validFile(parameters, "inputdir", false);
113 if (inputDir == "not found"){ inputDir = ""; }
116 it = parameters.find("phylip");
117 //user has given a template file
118 if(it != parameters.end()){
119 path = m->hasPath(it->second);
120 //if the user has not given a path then, add inputdir. else leave path alone.
121 if (path == "") { parameters["phylip"] = inputDir + it->second; }
124 it = parameters.find("column");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["column"] = inputDir + it->second; }
132 it = parameters.find("name");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["name"] = inputDir + it->second; }
140 it = parameters.find("taxonomy");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
148 it = parameters.find("fasta");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["fasta"] = inputDir + it->second; }
157 //check for required parameters
158 phylipfile = validParameter.validFile(parameters, "phylip", true);
159 if (phylipfile == "not open") { abort = true; }
160 else if (phylipfile == "not found") { phylipfile = ""; }
161 else { distfile = phylipfile; format = "phylip"; }
163 columnfile = validParameter.validFile(parameters, "column", true);
164 if (columnfile == "not open") { abort = true; }
165 else if (columnfile == "not found") { columnfile = ""; }
166 else { distfile = columnfile; format = "column"; }
168 namefile = validParameter.validFile(parameters, "name", true);
169 if (namefile == "not open") { abort = true; }
170 else if (namefile == "not found") { namefile = ""; }
172 fastafile = validParameter.validFile(parameters, "fasta", true);
173 if (fastafile == "not open") { abort = true; }
174 else if (fastafile == "not found") { fastafile = ""; }
175 else { distfile = fastafile; splitmethod = "fasta"; }
177 taxFile = validParameter.validFile(parameters, "taxonomy", true);
178 if (taxFile == "not open") { abort = true; }
179 else if (taxFile == "not found") { taxFile = ""; }
181 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
182 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
184 if (columnfile != "") {
185 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
188 if (fastafile != "") {
189 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
193 //check for optional parameter and set defaults
194 // ...at some point should added some additional type checking...
195 //get user cutoff and precision or use defaults
197 temp = validParameter.validFile(parameters, "precision", false);
198 if (temp == "not found") { temp = "100"; }
199 //saves precision legnth for formatting below
200 length = temp.length();
201 convert(temp, precision);
203 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
204 hard = m->isTrue(temp);
206 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
207 large = m->isTrue(temp);
209 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
210 convert(temp, processors);
212 temp = validParameter.validFile(parameters, "splitmethod", false);
213 if (splitmethod != "fasta") {
214 if (temp == "not found") { splitmethod = "distance"; }
215 else { splitmethod = temp; }
218 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
219 convert(temp, cutoff);
220 cutoff += (5 / (precision * 10.0));
222 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
223 convert(temp, taxLevelCutoff);
225 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
227 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
228 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
230 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
231 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
233 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
235 showabund = validParameter.validFile(parameters, "showabund", false);
236 if (showabund == "not found") { showabund = "T"; }
238 timing = validParameter.validFile(parameters, "timing", false);
239 if (timing == "not found") { timing = "F"; }
243 catch(exception& e) {
244 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
249 //**********************************************************************************************************************
251 void ClusterSplitCommand::help(){
253 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
254 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
255 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
256 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
257 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
258 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
259 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
260 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
261 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
262 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
263 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
264 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
265 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
266 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
267 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
268 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
269 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
271 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
273 m->mothurOut("The cluster.split command should be in the following format: \n");
274 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
275 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
278 catch(exception& e) {
279 m->errorOut(e, "ClusterSplitCommand", "help");
284 //**********************************************************************************************************************
286 ClusterSplitCommand::~ClusterSplitCommand(){}
288 //**********************************************************************************************************************
290 int ClusterSplitCommand::execute(){
293 if (abort == true) { return 0; }
296 vector<string> listFileNames;
298 string singletonName = "";
299 double saveCutoff = cutoff;
301 //****************** file prep work ******************************//
306 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
307 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
309 if (pid == 0) { //only process 0 converts and splits
313 //if user gave a phylip file convert to column file
314 if (format == "phylip") {
316 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
318 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
320 NameAssignment* nameMap = NULL;
321 convert->setFormat("phylip");
322 convert->read(nameMap);
324 if (m->control_pressed) { delete convert; return 0; }
326 distfile = convert->getOutputFile();
328 //if no names file given with phylip file, create it
329 ListVector* listToMakeNameFile = convert->getListVector();
330 if (namefile == "") { //you need to make a namefile for split matrix
332 namefile = phylipfile + ".names";
333 m->openOutputFile(namefile, out);
334 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
335 string bin = listToMakeNameFile->get(i);
336 out << bin << '\t' << bin << endl;
340 delete listToMakeNameFile;
343 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
345 if (m->control_pressed) { return 0; }
348 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
350 //split matrix into non-overlapping groups
352 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
353 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
354 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
355 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
359 if (m->control_pressed) { delete split; return 0; }
361 singletonName = split->getSingletonNames();
362 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
365 //output a merged distance file
366 if (splitmethod == "fasta") { createMergedDistanceFile(distName); }
369 if (m->control_pressed) { return 0; }
371 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
374 //****************** break up files between processes and cluster each file set ******************************//
376 ////you are process 0 from above////
378 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
379 dividedNames.resize(processors);
381 //for each file group figure out which process will complete it
382 //want to divide the load intelligently so the big files are spread between processes
384 for (int i = 0; i < distName.size(); i++) {
385 int processToAssign = (i+1) % processors;
386 if (processToAssign == 0) { processToAssign = processors; }
388 dividedNames[(processToAssign-1)].push_back(distName[i]);
391 //not lets reverse the order of ever other process, so we balance big files running with little ones
392 for (int i = 0; i < processors; i++) {
393 int remainder = ((i+1) % processors);
394 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
398 //send each child the list of files it needs to process
399 for(int i = 1; i < processors; i++) {
400 //send number of file pairs
401 int num = dividedNames[i].size();
402 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
404 for (int j = 0; j < num; j++) { //send filenames to process i
405 char tempDistFileName[1024];
406 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
407 int lengthDist = (dividedNames[i][j].begin()->first).length();
409 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
410 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
412 char tempNameFileName[1024];
413 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
414 int lengthName = (dividedNames[i][j].begin()->second).length();
416 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
417 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
422 listFileNames = cluster(dividedNames[0], labels);
424 //receive the other processes info
425 for(int i = 1; i < processors; i++) {
426 int num = dividedNames[i].size();
429 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
430 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
432 //send list filenames to root process
433 for (int j = 0; j < num; j++) {
435 char tempListFileName[1024];
437 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
438 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
440 string myListFileName = tempListFileName;
441 myListFileName = myListFileName.substr(0, lengthList);
443 listFileNames.push_back(myListFileName);
446 //send Labels to root process
448 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
450 for (int j = 0; j < numLabels; j++) {
454 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
455 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
457 string myLabel = tempLabel;
458 myLabel = myLabel.substr(0, lengthLabel);
460 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
464 }else { //you are a child process
465 vector < map<string, string> > myNames;
467 //recieve the files you need to process
468 //receive number of file pairs
470 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
474 for (int j = 0; j < num; j++) { //receive filenames to process
476 char tempDistFileName[1024];
478 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
479 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
481 string myDistFileName = tempDistFileName;
482 myDistFileName = myDistFileName.substr(0, lengthDist);
485 char tempNameFileName[1024];
487 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
488 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
490 string myNameFileName = tempNameFileName;
491 myNameFileName = myNameFileName.substr(0, lengthName);
494 myNames[j][myDistFileName] = myNameFileName;
498 listFileNames = cluster(myNames, labels);
501 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
503 //send list filenames to root process
504 for (int j = 0; j < num; j++) {
505 char tempListFileName[1024];
506 strcpy(tempListFileName, listFileNames[j].c_str());
507 int lengthList = listFileNames[j].length();
509 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
510 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
513 //send Labels to root process
514 int numLabels = labels.size();
515 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
517 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
519 strcpy(tempLabel, (*it).c_str());
520 int lengthLabel = (*it).length();
522 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
523 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
528 MPI_Barrier(MPI_COMM_WORLD);
532 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
534 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
536 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
537 dividedNames.resize(processors);
539 //for each file group figure out which process will complete it
540 //want to divide the load intelligently so the big files are spread between processes
541 for (int i = 0; i < distName.size(); i++) {
542 int processToAssign = (i+1) % processors;
543 if (processToAssign == 0) { processToAssign = processors; }
545 dividedNames[(processToAssign-1)].push_back(distName[i]);
548 //not lets reverse the order of ever other process, so we balance big files running with little ones
549 for (int i = 0; i < processors; i++) {
550 int remainder = ((i+1) % processors);
551 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
554 createProcesses(dividedNames);
556 if (m->control_pressed) { return 0; }
558 //get list of list file names from each process
559 for(int i=0;i<processors;i++){
560 string filename = toString(processIDS[i]) + ".temp";
562 m->openInputFile(filename, in);
564 in >> tag; m->gobble(in);
568 in >> tempName; m->gobble(in);
569 listFileNames.push_back(tempName);
572 remove((toString(processIDS[i]) + ".temp").c_str());
575 filename = toString(processIDS[i]) + ".temp.labels";
577 m->openInputFile(filename, in2);
580 in2 >> tempCutoff; m->gobble(in2);
581 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
585 in2 >> tempName; m->gobble(in2);
586 if (labels.count(tempName) == 0) { labels.insert(tempName); }
589 remove((toString(processIDS[i]) + ".temp.labels").c_str());
593 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
596 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
598 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
600 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
602 //****************** merge list file and create rabund and sabund files ******************************//
604 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
607 if (pid == 0) { //only process 0 merges
610 ListVector* listSingle;
611 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
613 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
615 mergeLists(listFileNames, labelBins, listSingle);
617 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
619 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
621 m->mothurOutEndLine();
622 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
623 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
624 m->mothurOutEndLine();
627 } //only process 0 merges
630 MPI_Barrier(MPI_COMM_WORLD);
635 catch(exception& e) {
636 m->errorOut(e, "ClusterSplitCommand", "execute");
640 //**********************************************************************************************************************
641 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
644 map<float, int> labelBin;
645 vector<float> orderFloat;
649 if (singleton != "none") {
651 m->openInputFile(singleton, in);
653 string firstCol, secondCol;
654 listSingle = new ListVector();
656 in >> firstCol >> secondCol; m->gobble(in);
657 listSingle->push_back(secondCol);
660 remove(singleton.c_str());
662 numSingleBins = listSingle->getNumBins();
663 }else{ listSingle = NULL; numSingleBins = 0; }
665 //go through users set and make them floats so we can sort them
666 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
669 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
670 else if (*it == "unique") { temp = -1.0; }
672 if (temp <= cutoff) {
673 orderFloat.push_back(temp);
674 labelBin[temp] = numSingleBins; //initialize numbins
679 sort(orderFloat.begin(), orderFloat.end());
682 //get the list info from each file
683 for (int k = 0; k < listNames.size(); k++) {
685 if (m->control_pressed) {
686 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
687 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
691 InputData* input = new InputData(listNames[k], "list");
692 ListVector* list = input->getListVector();
693 string lastLabel = list->getLabel();
695 string filledInList = listNames[k] + "filledInTemp";
697 m->openOutputFile(filledInList, outFilled);
699 //for each label needed
700 for(int l = 0; l < orderFloat.size(); l++){
703 if (orderFloat[l] == -1) { thisLabel = "unique"; }
704 else { thisLabel = toString(orderFloat[l], length-1); }
706 //this file has reached the end
708 list = input->getListVector(lastLabel, true);
709 }else{ //do you have the distance, or do you need to fill in
712 if (list->getLabel() == "unique") { labelFloat = -1.0; }
713 else { convert(list->getLabel(), labelFloat); }
715 //check for missing labels
716 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
717 //if its bigger get last label, otherwise keep it
719 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
721 lastLabel = list->getLabel();
725 list->setLabel(thisLabel);
726 list->print(outFilled);
729 labelBin[orderFloat[l]] += list->getNumBins();
733 list = input->getListVector();
736 if (list != NULL) { delete list; }
740 remove(listNames[k].c_str());
741 rename(filledInList.c_str(), listNames[k].c_str());
746 catch(exception& e) {
747 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
751 //**********************************************************************************************************************
752 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
754 if (outputDir == "") { outputDir += m->hasPath(distfile); }
755 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
757 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
758 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
759 m->openOutputFile(fileroot+ tag + ".list", outList);
761 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
762 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
763 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
765 map<float, int>::iterator itLabel;
767 //for each label needed
768 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
771 if (itLabel->first == -1) { thisLabel = "unique"; }
772 else { thisLabel = toString(itLabel->first, length-1); }
774 outList << thisLabel << '\t' << itLabel->second << '\t';
776 RAbundVector* rabund = new RAbundVector();
777 rabund->setLabel(thisLabel);
780 if (listSingle != NULL) {
781 for (int j = 0; j < listSingle->getNumBins(); j++) {
782 outList << listSingle->get(j) << '\t';
783 rabund->push_back(m->getNumNames(listSingle->get(j)));
787 //get the list info from each file
788 for (int k = 0; k < listNames.size(); k++) {
790 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
792 InputData* input = new InputData(listNames[k], "list");
793 ListVector* list = input->getListVector(thisLabel);
795 //this file has reached the end
796 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
798 for (int j = 0; j < list->getNumBins(); j++) {
799 outList << list->get(j) << '\t';
800 rabund->push_back(m->getNumNames(list->get(j)));
807 SAbundVector sabund = rabund->getSAbundVector();
809 sabund.print(outSabund);
810 rabund->print(outRabund);
820 if (listSingle != NULL) { delete listSingle; }
822 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
826 catch(exception& e) {
827 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
832 //**********************************************************************************************************************
834 void ClusterSplitCommand::printData(ListVector* oldList){
836 string label = oldList->getLabel();
837 RAbundVector oldRAbund = oldList->getRAbundVector();
839 oldRAbund.setLabel(label);
840 if (m->isTrue(showabund)) {
841 oldRAbund.getSAbundVector().print(cout);
843 oldRAbund.print(outRabund);
844 oldRAbund.getSAbundVector().print(outSabund);
846 oldList->print(outList);
848 catch(exception& e) {
849 m->errorOut(e, "ClusterSplitCommand", "printData");
853 //**********************************************************************************************************************
854 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
857 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
862 //loop through and create all the processes you want
863 while (process != processors) {
867 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
871 vector<string> listFileNames = cluster(dividedNames[process], labels);
873 //write out names to file
874 string filename = toString(getpid()) + ".temp";
876 m->openOutputFile(filename, out);
878 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
883 filename = toString(getpid()) + ".temp.labels";
884 m->openOutputFile(filename, outLabels);
886 outLabels << cutoff << endl;
887 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
888 outLabels << (*it) << endl;
894 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
895 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
900 //force parent to wait until all the processes are done
901 for (int i=0;i<processors;i++) {
902 int temp = processIDS[i];
910 catch(exception& e) {
911 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
915 //**********************************************************************************************************************
917 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
920 SparseMatrix* matrix;
923 RAbundVector* rabund;
925 vector<string> listFileNames;
927 double smallestCutoff = cutoff;
929 //cluster each distance file
930 for (int i = 0; i < distNames.size(); i++) {
931 if (m->control_pressed) { return listFileNames; }
933 string thisNamefile = distNames[i].begin()->second;
934 string thisDistFile = distNames[i].begin()->first;
936 //read in distance file
937 globaldata->setNameFile(thisNamefile);
938 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
942 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
944 //output your files too
946 cout << endl << "Reading " << thisDistFile << endl;
950 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
952 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
953 read->setCutoff(cutoff);
955 NameAssignment* nameMap = new NameAssignment(thisNamefile);
959 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
961 list = read->getListVector();
963 matrix = read->getMatrix();
970 //output your files too
972 cout << endl << "Clustering " << thisDistFile << endl;
976 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
978 rabund = new RAbundVector(list->getRAbundVector());
981 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
982 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
983 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
984 tag = cluster->getTag();
986 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
987 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
990 m->openOutputFile(fileroot+ tag + ".list", listFile);
992 listFileNames.push_back(fileroot+ tag + ".list");
994 float previousDist = 0.00000;
995 float rndPreviousDist = 0.00000;
1001 double saveCutoff = cutoff;
1003 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1005 if (m->control_pressed) { //clean up
1006 delete matrix; delete list; delete cluster; delete rabund;
1008 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1009 listFileNames.clear(); return listFileNames;
1012 cluster->update(saveCutoff);
1014 float dist = matrix->getSmallDist();
1017 rndDist = m->ceilDist(dist, precision);
1019 rndDist = m->roundDist(dist, precision);
1022 if(previousDist <= 0.0000 && dist != previousDist){
1023 oldList.setLabel("unique");
1024 oldList.print(listFile);
1025 if (labels.count("unique") == 0) { labels.insert("unique"); }
1027 else if(rndDist != rndPreviousDist){
1028 oldList.setLabel(toString(rndPreviousDist, length-1));
1029 oldList.print(listFile);
1030 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1033 previousDist = dist;
1034 rndPreviousDist = rndDist;
1039 if(previousDist <= 0.0000){
1040 oldList.setLabel("unique");
1041 oldList.print(listFile);
1042 if (labels.count("unique") == 0) { labels.insert("unique"); }
1044 else if(rndPreviousDist<cutoff){
1045 oldList.setLabel(toString(rndPreviousDist, length-1));
1046 oldList.print(listFile);
1047 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1050 delete matrix; delete list; delete cluster; delete rabund;
1053 if (m->control_pressed) { //clean up
1054 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1055 listFileNames.clear(); return listFileNames;
1058 remove(thisDistFile.c_str());
1059 remove(thisNamefile.c_str());
1061 if (saveCutoff != cutoff) {
1062 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1063 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1065 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1068 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1071 cutoff = smallestCutoff;
1073 return listFileNames;
1076 catch(exception& e) {
1077 m->errorOut(e, "ClusterSplitCommand", "cluster");
1083 //**********************************************************************************************************************
1085 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1090 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
1095 string thisOutputDir = outputDir;
1096 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
1097 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
1098 remove(outputFileName.c_str());
1101 for (int i = 0; i < distNames.size(); i++) {
1102 if (m->control_pressed) { return 0; }
1104 string thisDistFile = distNames[i].begin()->first;
1106 m->appendFiles(thisDistFile, outputFileName);
1109 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1119 catch(exception& e) {
1120 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1124 //**********************************************************************************************************************