2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
26 //allow user to run help
27 if(option == "help") { help(); abort = true; }
30 //valid paramters for this command
31 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
32 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
34 OptionParser parser(option);
35 map<string,string> parameters = parser.getParameters();
37 ValidParameters validParameter;
39 //check to make sure all parameters are valid for command
40 map<string,string>::iterator it;
41 for (it = parameters.begin(); it != parameters.end(); it++) {
42 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
47 globaldata->newRead();
49 //if the user changes the output directory command factory will send this info to us in the output parameter
50 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
52 //if the user changes the input directory command factory will send this info to us in the output parameter
53 string inputDir = validParameter.validFile(parameters, "inputdir", false);
54 if (inputDir == "not found"){ inputDir = ""; }
57 it = parameters.find("phylip");
58 //user has given a template file
59 if(it != parameters.end()){
60 path = hasPath(it->second);
61 //if the user has not given a path then, add inputdir. else leave path alone.
62 if (path == "") { parameters["phylip"] = inputDir + it->second; }
65 it = parameters.find("column");
66 //user has given a template file
67 if(it != parameters.end()){
68 path = hasPath(it->second);
69 //if the user has not given a path then, add inputdir. else leave path alone.
70 if (path == "") { parameters["column"] = inputDir + it->second; }
73 it = parameters.find("name");
74 //user has given a template file
75 if(it != parameters.end()){
76 path = hasPath(it->second);
77 //if the user has not given a path then, add inputdir. else leave path alone.
78 if (path == "") { parameters["name"] = inputDir + it->second; }
81 it = parameters.find("taxonomy");
82 //user has given a template file
83 if(it != parameters.end()){
84 path = hasPath(it->second);
85 //if the user has not given a path then, add inputdir. else leave path alone.
86 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
89 it = parameters.find("fasta");
90 //user has given a template file
91 if(it != parameters.end()){
92 path = hasPath(it->second);
93 //if the user has not given a path then, add inputdir. else leave path alone.
94 if (path == "") { parameters["fasta"] = inputDir + it->second; }
98 //check for required parameters
99 phylipfile = validParameter.validFile(parameters, "phylip", true);
100 if (phylipfile == "not open") { abort = true; }
101 else if (phylipfile == "not found") { phylipfile = ""; }
102 else { distfile = phylipfile; format = "phylip"; }
104 columnfile = validParameter.validFile(parameters, "column", true);
105 if (columnfile == "not open") { abort = true; }
106 else if (columnfile == "not found") { columnfile = ""; }
107 else { distfile = columnfile; format = "column"; }
109 namefile = validParameter.validFile(parameters, "name", true);
110 if (namefile == "not open") { abort = true; }
111 else if (namefile == "not found") { namefile = ""; }
113 fastafile = validParameter.validFile(parameters, "fasta", true);
114 if (fastafile == "not open") { abort = true; }
115 else if (fastafile == "not found") { fastafile = ""; }
116 else { distfile = fastafile; splitmethod = "fasta"; }
118 taxFile = validParameter.validFile(parameters, "taxonomy", true);
119 if (taxFile == "not open") { abort = true; }
120 else if (taxFile == "not found") { taxFile = ""; }
122 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
123 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
125 if (columnfile != "") {
126 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
129 if (fastafile != "") {
130 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
131 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
134 //check for optional parameter and set defaults
135 // ...at some point should added some additional type checking...
136 //get user cutoff and precision or use defaults
138 temp = validParameter.validFile(parameters, "precision", false);
139 if (temp == "not found") { temp = "100"; }
140 //saves precision legnth for formatting below
141 length = temp.length();
142 convert(temp, precision);
144 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
147 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
148 large = isTrue(temp);
150 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
151 convert(temp, processors);
153 temp = validParameter.validFile(parameters, "splitmethod", false);
154 if (splitmethod != "fasta") {
155 if (temp == "not found") { splitmethod = "distance"; }
156 else { splitmethod = temp; }
159 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
160 convert(temp, cutoff);
161 cutoff += (5 / (precision * 10.0));
163 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
164 convert(temp, taxLevelCutoff);
166 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
168 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
169 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
171 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
172 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
174 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
176 showabund = validParameter.validFile(parameters, "showabund", false);
177 if (showabund == "not found") { showabund = "T"; }
179 timing = validParameter.validFile(parameters, "timing", false);
180 if (timing == "not found") { timing = "F"; }
184 catch(exception& e) {
185 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
190 //**********************************************************************************************************************
192 void ClusterSplitCommand::help(){
194 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
195 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
196 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
197 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
198 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
199 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
200 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
201 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
202 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
203 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
204 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
205 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
206 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
207 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
208 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
209 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
210 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
212 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
214 m->mothurOut("The cluster.split command should be in the following format: \n");
215 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
216 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
219 catch(exception& e) {
220 m->errorOut(e, "ClusterSplitCommand", "help");
225 //**********************************************************************************************************************
227 ClusterSplitCommand::~ClusterSplitCommand(){}
229 //**********************************************************************************************************************
231 int ClusterSplitCommand::execute(){
234 if (abort == true) { return 0; }
237 vector<string> listFileNames;
239 string singletonName = "";
241 //****************** file prep work ******************************//
246 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
247 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
249 if (pid == 0) { //only process 0 converts and splits
253 //if user gave a phylip file convert to column file
254 if (format == "phylip") {
256 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
258 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
260 NameAssignment* nameMap = NULL;
261 convert->setFormat("phylip");
262 convert->read(nameMap);
264 if (m->control_pressed) { delete convert; return 0; }
266 distfile = convert->getOutputFile();
268 //if no names file given with phylip file, create it
269 ListVector* listToMakeNameFile = convert->getListVector();
270 if (namefile == "") { //you need to make a namefile for split matrix
272 namefile = phylipfile + ".names";
273 openOutputFile(namefile, out);
274 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
275 string bin = listToMakeNameFile->get(i);
276 out << bin << '\t' << bin << endl;
280 delete listToMakeNameFile;
283 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
285 if (m->control_pressed) { return 0; }
288 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
290 //split matrix into non-overlapping groups
292 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
293 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
294 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir); }
295 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
299 if (m->control_pressed) { delete split; return 0; }
301 singletonName = split->getSingletonNames();
302 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
305 if (m->control_pressed) { return 0; }
307 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
310 //****************** break up files between processes and cluster each file set ******************************//
312 ////you are process 0 from above////
314 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
315 dividedNames.resize(processors);
317 //for each file group figure out which process will complete it
318 //want to divide the load intelligently so the big files are spread between processes
320 for (int i = 0; i < distName.size(); i++) {
321 int processToAssign = (i+1) % processors;
322 if (processToAssign == 0) { processToAssign = processors; }
324 dividedNames[(processToAssign-1)].push_back(distName[i]);
327 //not lets reverse the order of ever other process, so we balance big files running with little ones
328 for (int i = 0; i < processors; i++) {
329 int remainder = ((i+1) % processors);
330 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
334 //send each child the list of files it needs to process
335 for(int i = 1; i < processors; i++) {
336 //send number of file pairs
337 int num = dividedNames[i].size();
338 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
340 for (int j = 0; j < num; j++) { //send filenames to process i
341 char tempDistFileName[1024];
342 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
343 int lengthDist = (dividedNames[i][j].begin()->first).length();
345 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
346 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
348 char tempNameFileName[1024];
349 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
350 int lengthName = (dividedNames[i][j].begin()->second).length();
352 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
353 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
358 listFileNames = cluster(dividedNames[0], labels);
360 //receive the other processes info
361 for(int i = 1; i < processors; i++) {
362 int num = dividedNames[i].size();
364 //send list filenames to root process
365 for (int j = 0; j < num; j++) {
367 char tempListFileName[1024];
369 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
370 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
372 string myListFileName = tempListFileName;
373 myListFileName = myListFileName.substr(0, lengthList);
375 listFileNames.push_back(myListFileName);
378 //send Labels to root process
380 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
382 for (int j = 0; j < numLabels; j++) {
386 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
387 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
389 string myLabel = tempLabel;
390 myLabel = myLabel.substr(0, lengthLabel);
392 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
396 }else { //you are a child process
397 vector < map<string, string> > myNames;
399 //recieve the files you need to process
400 //receive number of file pairs
402 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
406 for (int j = 0; j < num; j++) { //receive filenames to process
408 char tempDistFileName[1024];
410 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
411 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
413 string myDistFileName = tempDistFileName;
414 myDistFileName = myDistFileName.substr(0, lengthDist);
417 char tempNameFileName[1024];
419 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
420 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
422 string myNameFileName = tempNameFileName;
423 myNameFileName = myNameFileName.substr(0, lengthName);
426 myNames[j][myDistFileName] = myNameFileName;
430 listFileNames = cluster(myNames, labels);
432 //send list filenames to root process
433 for (int j = 0; j < num; j++) {
434 char tempListFileName[1024];
435 strcpy(tempListFileName, listFileNames[j].c_str());
436 int lengthList = listFileNames[j].length();
438 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
439 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
442 //send Labels to root process
443 int numLabels = labels.size();
444 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
446 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
448 strcpy(tempLabel, (*it).c_str());
449 int lengthLabel = (*it).length();
451 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
452 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
457 MPI_Barrier(MPI_COMM_WORLD);
461 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
463 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
465 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
466 dividedNames.resize(processors);
468 //for each file group figure out which process will complete it
469 //want to divide the load intelligently so the big files are spread between processes
471 for (int i = 0; i < distName.size(); i++) {
472 int processToAssign = (i+1) % processors;
473 if (processToAssign == 0) { processToAssign = processors; }
475 dividedNames[(processToAssign-1)].push_back(distName[i]);
478 //not lets reverse the order of ever other process, so we balance big files running with little ones
479 for (int i = 0; i < processors; i++) {
480 int remainder = ((i+1) % processors);
481 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
484 createProcesses(dividedNames);
486 if (m->control_pressed) { return 0; }
488 //get list of list file names from each process
489 for(int i=0;i<processors;i++){
490 string filename = toString(processIDS[i]) + ".temp";
492 openInputFile(filename, in);
494 in >> tag; gobble(in);
498 in >> tempName; gobble(in);
499 listFileNames.push_back(tempName);
502 remove((toString(processIDS[i]) + ".temp").c_str());
505 filename = toString(processIDS[i]) + ".temp.labels";
507 openInputFile(filename, in2);
511 in2 >> tempName; gobble(in2);
512 if (labels.count(tempName) == 0) { labels.insert(tempName); }
515 remove((toString(processIDS[i]) + ".temp.labels").c_str());
519 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
522 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
524 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
526 //****************** merge list file and create rabund and sabund files ******************************//
528 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
531 if (pid == 0) { //only process 0 merges
534 ListVector* listSingle;
535 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
537 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
539 mergeLists(listFileNames, labelBins, listSingle);
541 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
543 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
545 m->mothurOutEndLine();
546 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
547 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
548 m->mothurOutEndLine();
551 } //only process 0 merges
554 MPI_Barrier(MPI_COMM_WORLD);
559 catch(exception& e) {
560 m->errorOut(e, "ClusterSplitCommand", "execute");
564 //**********************************************************************************************************************
565 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
568 map<float, int> labelBin;
569 vector<float> orderFloat;
573 if (singleton != "none") {
575 openInputFile(singleton, in);
577 string firstCol, secondCol;
578 listSingle = new ListVector();
580 in >> firstCol >> secondCol; gobble(in);
581 listSingle->push_back(secondCol);
584 remove(singleton.c_str());
586 numSingleBins = listSingle->getNumBins();
587 }else{ listSingle = NULL; numSingleBins = 0; }
589 //go through users set and make them floats so we can sort them
590 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
593 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
594 else if (*it == "unique") { temp = -1.0; }
596 orderFloat.push_back(temp);
597 labelBin[temp] = numSingleBins; //initialize numbins
601 sort(orderFloat.begin(), orderFloat.end());
604 //get the list info from each file
605 for (int k = 0; k < listNames.size(); k++) {
607 if (m->control_pressed) {
608 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
609 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
613 InputData* input = new InputData(listNames[k], "list");
614 ListVector* list = input->getListVector();
615 string lastLabel = list->getLabel();
617 string filledInList = listNames[k] + "filledInTemp";
619 openOutputFile(filledInList, outFilled);
621 //for each label needed
622 for(int l = 0; l < orderFloat.size(); l++){
625 if (orderFloat[l] == -1) { thisLabel = "unique"; }
626 else { thisLabel = toString(orderFloat[l], length-1); }
628 //this file has reached the end
630 list = input->getListVector(lastLabel, true);
631 }else{ //do you have the distance, or do you need to fill in
634 if (list->getLabel() == "unique") { labelFloat = -1.0; }
635 else { convert(list->getLabel(), labelFloat); }
637 //check for missing labels
638 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
639 //if its bigger get last label, otherwise keep it
641 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
643 lastLabel = list->getLabel();
647 list->setLabel(thisLabel);
648 list->print(outFilled);
651 labelBin[orderFloat[l]] += list->getNumBins();
655 list = input->getListVector();
658 if (list != NULL) { delete list; }
662 remove(listNames[k].c_str());
663 rename(filledInList.c_str(), listNames[k].c_str());
668 catch(exception& e) {
669 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
673 //**********************************************************************************************************************
674 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
676 if (outputDir == "") { outputDir += hasPath(distfile); }
677 fileroot = outputDir + getRootName(getSimpleName(distfile));
679 openOutputFile(fileroot+ tag + ".sabund", outSabund);
680 openOutputFile(fileroot+ tag + ".rabund", outRabund);
681 openOutputFile(fileroot+ tag + ".list", outList);
683 outputNames.push_back(fileroot+ tag + ".sabund");
684 outputNames.push_back(fileroot+ tag + ".rabund");
685 outputNames.push_back(fileroot+ tag + ".list");
687 map<float, int>::iterator itLabel;
689 //for each label needed
690 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
693 if (itLabel->first == -1) { thisLabel = "unique"; }
694 else { thisLabel = toString(itLabel->first, length-1); }
696 outList << thisLabel << '\t' << itLabel->second << '\t';
698 RAbundVector* rabund = new RAbundVector();
699 rabund->setLabel(thisLabel);
702 if (listSingle != NULL) {
703 for (int j = 0; j < listSingle->getNumBins(); j++) {
704 outList << listSingle->get(j) << '\t';
705 rabund->push_back(getNumNames(listSingle->get(j)));
709 //get the list info from each file
710 for (int k = 0; k < listNames.size(); k++) {
712 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
714 InputData* input = new InputData(listNames[k], "list");
715 ListVector* list = input->getListVector(thisLabel);
717 //this file has reached the end
718 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
720 for (int j = 0; j < list->getNumBins(); j++) {
721 outList << list->get(j) << '\t';
722 rabund->push_back(getNumNames(list->get(j)));
729 SAbundVector sabund = rabund->getSAbundVector();
731 sabund.print(outSabund);
732 rabund->print(outRabund);
742 if (listSingle != NULL) { delete listSingle; }
744 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
748 catch(exception& e) {
749 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
754 //**********************************************************************************************************************
756 void ClusterSplitCommand::printData(ListVector* oldList){
758 string label = oldList->getLabel();
759 RAbundVector oldRAbund = oldList->getRAbundVector();
761 oldRAbund.setLabel(label);
762 if (isTrue(showabund)) {
763 oldRAbund.getSAbundVector().print(cout);
765 oldRAbund.print(outRabund);
766 oldRAbund.getSAbundVector().print(outSabund);
768 oldList->print(outList);
770 catch(exception& e) {
771 m->errorOut(e, "ClusterSplitCommand", "printData");
775 //**********************************************************************************************************************
776 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
779 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
784 //loop through and create all the processes you want
785 while (process != processors) {
789 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
793 vector<string> listFileNames = cluster(dividedNames[process], labels);
795 //write out names to file
796 string filename = toString(getpid()) + ".temp";
798 openOutputFile(filename, out);
800 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
805 filename = toString(getpid()) + ".temp.labels";
806 openOutputFile(filename, outLabels);
808 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
809 outLabels << (*it) << endl;
814 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
817 //force parent to wait until all the processes are done
818 for (int i=0;i<processors;i++) {
819 int temp = processIDS[i];
827 catch(exception& e) {
828 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
832 //**********************************************************************************************************************
834 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
837 SparseMatrix* matrix;
840 RAbundVector* rabund;
842 vector<string> listFileNames;
844 //cluster each distance file
845 for (int i = 0; i < distNames.size(); i++) {
847 string thisNamefile = distNames[i].begin()->second;
848 string thisDistFile = distNames[i].begin()->first;
850 //read in distance file
851 globaldata->setNameFile(thisNamefile);
852 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
856 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
858 //output your files too
860 cout << endl << "Reading " << thisDistFile << endl;
864 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
866 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
867 read->setCutoff(cutoff);
869 NameAssignment* nameMap = new NameAssignment(thisNamefile);
873 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
875 list = read->getListVector();
877 matrix = read->getMatrix();
884 //output your files too
886 cout << endl << "Clustering " << thisDistFile << endl;
890 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
892 rabund = new RAbundVector(list->getRAbundVector());
895 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
896 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
897 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
898 tag = cluster->getTag();
900 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
901 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
904 openOutputFile(fileroot+ tag + ".list", listFile);
906 listFileNames.push_back(fileroot+ tag + ".list");
908 time_t estart = time(NULL);
910 float previousDist = 0.00000;
911 float rndPreviousDist = 0.00000;
917 double saveCutoff = cutoff;
919 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
921 if (m->control_pressed) { //clean up
922 delete matrix; delete list; delete cluster; delete rabund;
924 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
925 listFileNames.clear(); return listFileNames;
928 cluster->update(cutoff);
930 float dist = matrix->getSmallDist();
933 rndDist = ceilDist(dist, precision);
935 rndDist = roundDist(dist, precision);
938 if(previousDist <= 0.0000 && dist != previousDist){
939 oldList.setLabel("unique");
940 oldList.print(listFile);
941 if (labels.count("unique") == 0) { labels.insert("unique"); }
943 else if(rndDist != rndPreviousDist){
944 oldList.setLabel(toString(rndPreviousDist, length-1));
945 oldList.print(listFile);
946 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
950 rndPreviousDist = rndDist;
955 if(previousDist <= 0.0000){
956 oldList.setLabel("unique");
957 oldList.print(listFile);
958 if (labels.count("unique") == 0) { labels.insert("unique"); }
960 else if(rndPreviousDist<cutoff){
961 oldList.setLabel(toString(rndPreviousDist, length-1));
962 oldList.print(listFile);
963 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
966 delete matrix; delete list; delete cluster; delete rabund;
969 if (m->control_pressed) { //clean up
970 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
971 listFileNames.clear(); return listFileNames;
974 remove(thisDistFile.c_str());
975 remove(thisNamefile.c_str());
979 return listFileNames;
982 catch(exception& e) {
983 m->errorOut(e, "ClusterSplitCommand", "cluster");
990 //**********************************************************************************************************************