2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
26 //allow user to run help
27 if(option == "help") { help(); abort = true; }
30 //valid paramters for this command
31 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
32 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
34 OptionParser parser(option);
35 map<string,string> parameters = parser.getParameters();
37 ValidParameters validParameter;
39 //check to make sure all parameters are valid for command
40 map<string,string>::iterator it;
41 for (it = parameters.begin(); it != parameters.end(); it++) {
42 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
47 globaldata->newRead();
49 //if the user changes the output directory command factory will send this info to us in the output parameter
50 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
52 //if the user changes the input directory command factory will send this info to us in the output parameter
53 string inputDir = validParameter.validFile(parameters, "inputdir", false);
54 if (inputDir == "not found"){ inputDir = ""; }
57 it = parameters.find("phylip");
58 //user has given a template file
59 if(it != parameters.end()){
60 path = hasPath(it->second);
61 //if the user has not given a path then, add inputdir. else leave path alone.
62 if (path == "") { parameters["phylip"] = inputDir + it->second; }
65 it = parameters.find("column");
66 //user has given a template file
67 if(it != parameters.end()){
68 path = hasPath(it->second);
69 //if the user has not given a path then, add inputdir. else leave path alone.
70 if (path == "") { parameters["column"] = inputDir + it->second; }
73 it = parameters.find("name");
74 //user has given a template file
75 if(it != parameters.end()){
76 path = hasPath(it->second);
77 //if the user has not given a path then, add inputdir. else leave path alone.
78 if (path == "") { parameters["name"] = inputDir + it->second; }
81 it = parameters.find("taxonomy");
82 //user has given a template file
83 if(it != parameters.end()){
84 path = hasPath(it->second);
85 //if the user has not given a path then, add inputdir. else leave path alone.
86 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
89 it = parameters.find("fasta");
90 //user has given a template file
91 if(it != parameters.end()){
92 path = hasPath(it->second);
93 //if the user has not given a path then, add inputdir. else leave path alone.
94 if (path == "") { parameters["fasta"] = inputDir + it->second; }
98 //check for required parameters
99 phylipfile = validParameter.validFile(parameters, "phylip", true);
100 if (phylipfile == "not open") { abort = true; }
101 else if (phylipfile == "not found") { phylipfile = ""; }
102 else { distfile = phylipfile; format = "phylip"; }
104 columnfile = validParameter.validFile(parameters, "column", true);
105 if (columnfile == "not open") { abort = true; }
106 else if (columnfile == "not found") { columnfile = ""; }
107 else { distfile = columnfile; format = "column"; }
109 namefile = validParameter.validFile(parameters, "name", true);
110 if (namefile == "not open") { abort = true; }
111 else if (namefile == "not found") { namefile = ""; }
113 fastafile = validParameter.validFile(parameters, "fasta", true);
114 if (fastafile == "not open") { abort = true; }
115 else if (fastafile == "not found") { fastafile = ""; }
116 else { distfile = fastafile; splitmethod = "fasta"; }
118 taxFile = validParameter.validFile(parameters, "taxonomy", true);
119 if (taxFile == "not open") { abort = true; }
120 else if (taxFile == "not found") { taxFile = ""; }
122 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
123 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
125 if (columnfile != "") {
126 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
129 if (fastafile != "") {
130 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
131 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
134 //check for optional parameter and set defaults
135 // ...at some point should added some additional type checking...
136 //get user cutoff and precision or use defaults
138 temp = validParameter.validFile(parameters, "precision", false);
139 if (temp == "not found") { temp = "100"; }
140 //saves precision legnth for formatting below
141 length = temp.length();
142 convert(temp, precision);
144 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
147 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
148 large = isTrue(temp);
150 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
151 convert(temp, processors);
153 temp = validParameter.validFile(parameters, "splitmethod", false);
154 if (splitmethod != "fasta") {
155 if (temp == "not found") { splitmethod = "distance"; }
156 else { splitmethod = temp; }
159 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
160 convert(temp, cutoff);
161 cutoff += (5 / (precision * 10.0));
163 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
164 convert(temp, taxLevelCutoff);
166 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
168 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
169 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
171 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
172 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
174 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
176 showabund = validParameter.validFile(parameters, "showabund", false);
177 if (showabund == "not found") { showabund = "T"; }
179 timing = validParameter.validFile(parameters, "timing", false);
180 if (timing == "not found") { timing = "F"; }
184 catch(exception& e) {
185 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
190 //**********************************************************************************************************************
192 void ClusterSplitCommand::help(){
194 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
195 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
196 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
197 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
198 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
199 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
200 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
201 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
202 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
203 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
204 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
205 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
206 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
207 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
208 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
209 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
210 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
212 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
214 m->mothurOut("The cluster.split command should be in the following format: \n");
215 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
216 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
219 catch(exception& e) {
220 m->errorOut(e, "ClusterSplitCommand", "help");
225 //**********************************************************************************************************************
227 ClusterSplitCommand::~ClusterSplitCommand(){}
229 //**********************************************************************************************************************
231 int ClusterSplitCommand::execute(){
234 if (abort == true) { return 0; }
237 vector<string> listFileNames;
239 string singletonName = "";
240 double saveCutoff = cutoff;
242 //****************** file prep work ******************************//
247 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
248 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
250 if (pid == 0) { //only process 0 converts and splits
254 //if user gave a phylip file convert to column file
255 if (format == "phylip") {
257 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
259 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
261 NameAssignment* nameMap = NULL;
262 convert->setFormat("phylip");
263 convert->read(nameMap);
265 if (m->control_pressed) { delete convert; return 0; }
267 distfile = convert->getOutputFile();
269 //if no names file given with phylip file, create it
270 ListVector* listToMakeNameFile = convert->getListVector();
271 if (namefile == "") { //you need to make a namefile for split matrix
273 namefile = phylipfile + ".names";
274 openOutputFile(namefile, out);
275 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
276 string bin = listToMakeNameFile->get(i);
277 out << bin << '\t' << bin << endl;
281 delete listToMakeNameFile;
284 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
286 if (m->control_pressed) { return 0; }
289 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
291 //split matrix into non-overlapping groups
293 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
294 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
295 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors, outputDir); }
296 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
300 if (m->control_pressed) { delete split; return 0; }
302 singletonName = split->getSingletonNames();
303 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
306 if (m->control_pressed) { return 0; }
308 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
311 //****************** break up files between processes and cluster each file set ******************************//
313 ////you are process 0 from above////
315 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
316 dividedNames.resize(processors);
318 //for each file group figure out which process will complete it
319 //want to divide the load intelligently so the big files are spread between processes
321 for (int i = 0; i < distName.size(); i++) {
322 int processToAssign = (i+1) % processors;
323 if (processToAssign == 0) { processToAssign = processors; }
325 dividedNames[(processToAssign-1)].push_back(distName[i]);
328 //not lets reverse the order of ever other process, so we balance big files running with little ones
329 for (int i = 0; i < processors; i++) {
330 int remainder = ((i+1) % processors);
331 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
335 //send each child the list of files it needs to process
336 for(int i = 1; i < processors; i++) {
337 //send number of file pairs
338 int num = dividedNames[i].size();
339 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
341 for (int j = 0; j < num; j++) { //send filenames to process i
342 char tempDistFileName[1024];
343 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
344 int lengthDist = (dividedNames[i][j].begin()->first).length();
346 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
347 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
349 char tempNameFileName[1024];
350 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
351 int lengthName = (dividedNames[i][j].begin()->second).length();
353 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
354 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
359 listFileNames = cluster(dividedNames[0], labels);
361 //receive the other processes info
362 for(int i = 1; i < processors; i++) {
363 int num = dividedNames[i].size();
366 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
367 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
369 //send list filenames to root process
370 for (int j = 0; j < num; j++) {
372 char tempListFileName[1024];
374 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
375 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
377 string myListFileName = tempListFileName;
378 myListFileName = myListFileName.substr(0, lengthList);
380 listFileNames.push_back(myListFileName);
383 //send Labels to root process
385 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
387 for (int j = 0; j < numLabels; j++) {
391 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
392 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
394 string myLabel = tempLabel;
395 myLabel = myLabel.substr(0, lengthLabel);
397 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
401 }else { //you are a child process
402 vector < map<string, string> > myNames;
404 //recieve the files you need to process
405 //receive number of file pairs
407 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
411 for (int j = 0; j < num; j++) { //receive filenames to process
413 char tempDistFileName[1024];
415 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
416 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
418 string myDistFileName = tempDistFileName;
419 myDistFileName = myDistFileName.substr(0, lengthDist);
422 char tempNameFileName[1024];
424 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
425 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
427 string myNameFileName = tempNameFileName;
428 myNameFileName = myNameFileName.substr(0, lengthName);
431 myNames[j][myDistFileName] = myNameFileName;
435 listFileNames = cluster(myNames, labels);
438 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
440 //send list filenames to root process
441 for (int j = 0; j < num; j++) {
442 char tempListFileName[1024];
443 strcpy(tempListFileName, listFileNames[j].c_str());
444 int lengthList = listFileNames[j].length();
446 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
447 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
450 //send Labels to root process
451 int numLabels = labels.size();
452 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
454 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
456 strcpy(tempLabel, (*it).c_str());
457 int lengthLabel = (*it).length();
459 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
460 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
465 MPI_Barrier(MPI_COMM_WORLD);
469 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
471 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
473 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
474 dividedNames.resize(processors);
476 //for each file group figure out which process will complete it
477 //want to divide the load intelligently so the big files are spread between processes
479 for (int i = 0; i < distName.size(); i++) {
480 int processToAssign = (i+1) % processors;
481 if (processToAssign == 0) { processToAssign = processors; }
483 dividedNames[(processToAssign-1)].push_back(distName[i]);
486 //not lets reverse the order of ever other process, so we balance big files running with little ones
487 for (int i = 0; i < processors; i++) {
488 int remainder = ((i+1) % processors);
489 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
492 createProcesses(dividedNames);
494 if (m->control_pressed) { return 0; }
496 //get list of list file names from each process
497 for(int i=0;i<processors;i++){
498 string filename = toString(processIDS[i]) + ".temp";
500 openInputFile(filename, in);
502 in >> tag; gobble(in);
506 in >> tempName; gobble(in);
507 listFileNames.push_back(tempName);
510 remove((toString(processIDS[i]) + ".temp").c_str());
513 filename = toString(processIDS[i]) + ".temp.labels";
515 openInputFile(filename, in2);
518 in2 >> tempCutoff; gobble(in2);
519 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
523 in2 >> tempName; gobble(in2);
524 if (labels.count(tempName) == 0) { labels.insert(tempName); }
527 remove((toString(processIDS[i]) + ".temp.labels").c_str());
531 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
534 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
536 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
538 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
540 //****************** merge list file and create rabund and sabund files ******************************//
542 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
545 if (pid == 0) { //only process 0 merges
548 ListVector* listSingle;
549 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
551 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
553 mergeLists(listFileNames, labelBins, listSingle);
555 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
557 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
559 m->mothurOutEndLine();
560 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
561 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
562 m->mothurOutEndLine();
565 } //only process 0 merges
568 MPI_Barrier(MPI_COMM_WORLD);
573 catch(exception& e) {
574 m->errorOut(e, "ClusterSplitCommand", "execute");
578 //**********************************************************************************************************************
579 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
582 map<float, int> labelBin;
583 vector<float> orderFloat;
587 if (singleton != "none") {
589 openInputFile(singleton, in);
591 string firstCol, secondCol;
592 listSingle = new ListVector();
594 in >> firstCol >> secondCol; gobble(in);
595 listSingle->push_back(secondCol);
598 remove(singleton.c_str());
600 numSingleBins = listSingle->getNumBins();
601 }else{ listSingle = NULL; numSingleBins = 0; }
603 //go through users set and make them floats so we can sort them
604 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
607 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
608 else if (*it == "unique") { temp = -1.0; }
610 if (temp <= cutoff) {
611 orderFloat.push_back(temp);
612 labelBin[temp] = numSingleBins; //initialize numbins
617 sort(orderFloat.begin(), orderFloat.end());
620 //get the list info from each file
621 for (int k = 0; k < listNames.size(); k++) {
623 if (m->control_pressed) {
624 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
625 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
629 InputData* input = new InputData(listNames[k], "list");
630 ListVector* list = input->getListVector();
631 string lastLabel = list->getLabel();
633 string filledInList = listNames[k] + "filledInTemp";
635 openOutputFile(filledInList, outFilled);
637 //for each label needed
638 for(int l = 0; l < orderFloat.size(); l++){
641 if (orderFloat[l] == -1) { thisLabel = "unique"; }
642 else { thisLabel = toString(orderFloat[l], length-1); }
644 //this file has reached the end
646 list = input->getListVector(lastLabel, true);
647 }else{ //do you have the distance, or do you need to fill in
650 if (list->getLabel() == "unique") { labelFloat = -1.0; }
651 else { convert(list->getLabel(), labelFloat); }
653 //check for missing labels
654 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
655 //if its bigger get last label, otherwise keep it
657 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
659 lastLabel = list->getLabel();
663 list->setLabel(thisLabel);
664 list->print(outFilled);
667 labelBin[orderFloat[l]] += list->getNumBins();
671 list = input->getListVector();
674 if (list != NULL) { delete list; }
678 remove(listNames[k].c_str());
679 rename(filledInList.c_str(), listNames[k].c_str());
684 catch(exception& e) {
685 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
689 //**********************************************************************************************************************
690 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
692 if (outputDir == "") { outputDir += hasPath(distfile); }
693 fileroot = outputDir + getRootName(getSimpleName(distfile));
695 openOutputFile(fileroot+ tag + ".sabund", outSabund);
696 openOutputFile(fileroot+ tag + ".rabund", outRabund);
697 openOutputFile(fileroot+ tag + ".list", outList);
699 outputNames.push_back(fileroot+ tag + ".sabund");
700 outputNames.push_back(fileroot+ tag + ".rabund");
701 outputNames.push_back(fileroot+ tag + ".list");
703 map<float, int>::iterator itLabel;
705 //for each label needed
706 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
709 if (itLabel->first == -1) { thisLabel = "unique"; }
710 else { thisLabel = toString(itLabel->first, length-1); }
712 outList << thisLabel << '\t' << itLabel->second << '\t';
714 RAbundVector* rabund = new RAbundVector();
715 rabund->setLabel(thisLabel);
718 if (listSingle != NULL) {
719 for (int j = 0; j < listSingle->getNumBins(); j++) {
720 outList << listSingle->get(j) << '\t';
721 rabund->push_back(getNumNames(listSingle->get(j)));
725 //get the list info from each file
726 for (int k = 0; k < listNames.size(); k++) {
728 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
730 InputData* input = new InputData(listNames[k], "list");
731 ListVector* list = input->getListVector(thisLabel);
733 //this file has reached the end
734 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
736 for (int j = 0; j < list->getNumBins(); j++) {
737 outList << list->get(j) << '\t';
738 rabund->push_back(getNumNames(list->get(j)));
745 SAbundVector sabund = rabund->getSAbundVector();
747 sabund.print(outSabund);
748 rabund->print(outRabund);
758 if (listSingle != NULL) { delete listSingle; }
760 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
764 catch(exception& e) {
765 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
770 //**********************************************************************************************************************
772 void ClusterSplitCommand::printData(ListVector* oldList){
774 string label = oldList->getLabel();
775 RAbundVector oldRAbund = oldList->getRAbundVector();
777 oldRAbund.setLabel(label);
778 if (isTrue(showabund)) {
779 oldRAbund.getSAbundVector().print(cout);
781 oldRAbund.print(outRabund);
782 oldRAbund.getSAbundVector().print(outSabund);
784 oldList->print(outList);
786 catch(exception& e) {
787 m->errorOut(e, "ClusterSplitCommand", "printData");
791 //**********************************************************************************************************************
792 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
795 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
800 //loop through and create all the processes you want
801 while (process != processors) {
805 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
809 vector<string> listFileNames = cluster(dividedNames[process], labels);
811 //write out names to file
812 string filename = toString(getpid()) + ".temp";
814 openOutputFile(filename, out);
816 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
821 filename = toString(getpid()) + ".temp.labels";
822 openOutputFile(filename, outLabels);
824 outLabels << cutoff << endl;
825 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
826 outLabels << (*it) << endl;
831 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
834 //force parent to wait until all the processes are done
835 for (int i=0;i<processors;i++) {
836 int temp = processIDS[i];
844 catch(exception& e) {
845 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
849 //**********************************************************************************************************************
851 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
854 SparseMatrix* matrix;
857 RAbundVector* rabund;
859 vector<string> listFileNames;
861 double smallestCutoff = cutoff;
863 //cluster each distance file
864 for (int i = 0; i < distNames.size(); i++) {
865 if (m->control_pressed) { return listFileNames; }
867 string thisNamefile = distNames[i].begin()->second;
868 string thisDistFile = distNames[i].begin()->first;
870 //read in distance file
871 globaldata->setNameFile(thisNamefile);
872 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
876 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
878 //output your files too
880 cout << endl << "Reading " << thisDistFile << endl;
884 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
886 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
887 read->setCutoff(cutoff);
889 NameAssignment* nameMap = new NameAssignment(thisNamefile);
893 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
895 list = read->getListVector();
897 matrix = read->getMatrix();
904 //output your files too
906 cout << endl << "Clustering " << thisDistFile << endl;
910 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
912 rabund = new RAbundVector(list->getRAbundVector());
915 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
916 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
917 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
918 tag = cluster->getTag();
920 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
921 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
924 openOutputFile(fileroot+ tag + ".list", listFile);
926 listFileNames.push_back(fileroot+ tag + ".list");
928 time_t estart = time(NULL);
930 float previousDist = 0.00000;
931 float rndPreviousDist = 0.00000;
937 double saveCutoff = cutoff;
939 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
941 if (m->control_pressed) { //clean up
942 delete matrix; delete list; delete cluster; delete rabund;
944 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
945 listFileNames.clear(); return listFileNames;
948 cluster->update(saveCutoff);
950 float dist = matrix->getSmallDist();
953 rndDist = ceilDist(dist, precision);
955 rndDist = roundDist(dist, precision);
958 if(previousDist <= 0.0000 && dist != previousDist){
959 oldList.setLabel("unique");
960 oldList.print(listFile);
961 if (labels.count("unique") == 0) { labels.insert("unique"); }
963 else if(rndDist != rndPreviousDist){
964 oldList.setLabel(toString(rndPreviousDist, length-1));
965 oldList.print(listFile);
966 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
970 rndPreviousDist = rndDist;
975 if(previousDist <= 0.0000){
976 oldList.setLabel("unique");
977 oldList.print(listFile);
978 if (labels.count("unique") == 0) { labels.insert("unique"); }
980 else if(rndPreviousDist<cutoff){
981 oldList.setLabel(toString(rndPreviousDist, length-1));
982 oldList.print(listFile);
983 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
986 delete matrix; delete list; delete cluster; delete rabund;
989 if (m->control_pressed) { //clean up
990 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
991 listFileNames.clear(); return listFileNames;
994 remove(thisDistFile.c_str());
995 remove(thisNamefile.c_str());
997 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine(); }
999 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1002 cutoff = smallestCutoff;
1004 return listFileNames;
1007 catch(exception& e) {
1008 m->errorOut(e, "ClusterSplitCommand", "cluster");
1015 //**********************************************************************************************************************