2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
25 //allow user to run help
26 if(option == "help") { help(); abort = true; }
29 //valid paramters for this command
30 string Array[] = {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","outputdir","inputdir"};
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
38 //check to make sure all parameters are valid for command
39 map<string,string>::iterator it;
40 for (it = parameters.begin(); it != parameters.end(); it++) {
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
46 globaldata->newRead();
48 //if the user changes the output directory command factory will send this info to us in the output parameter
49 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
51 //if the user changes the input directory command factory will send this info to us in the output parameter
52 string inputDir = validParameter.validFile(parameters, "inputdir", false);
53 if (inputDir == "not found"){ inputDir = ""; }
56 it = parameters.find("phylip");
57 //user has given a template file
58 if(it != parameters.end()){
59 path = hasPath(it->second);
60 //if the user has not given a path then, add inputdir. else leave path alone.
61 if (path == "") { parameters["phylip"] = inputDir + it->second; }
64 it = parameters.find("column");
65 //user has given a template file
66 if(it != parameters.end()){
67 path = hasPath(it->second);
68 //if the user has not given a path then, add inputdir. else leave path alone.
69 if (path == "") { parameters["column"] = inputDir + it->second; }
72 it = parameters.find("name");
73 //user has given a template file
74 if(it != parameters.end()){
75 path = hasPath(it->second);
76 //if the user has not given a path then, add inputdir. else leave path alone.
77 if (path == "") { parameters["name"] = inputDir + it->second; }
81 //check for required parameters
82 phylipfile = validParameter.validFile(parameters, "phylip", true);
83 if (phylipfile == "not open") { abort = true; }
84 else if (phylipfile == "not found") { phylipfile = ""; }
85 else { distfile = phylipfile; format = "phylip"; }
87 columnfile = validParameter.validFile(parameters, "column", true);
88 if (columnfile == "not open") { abort = true; }
89 else if (columnfile == "not found") { columnfile = ""; }
90 else { distfile = columnfile; format = "column"; }
92 namefile = validParameter.validFile(parameters, "name", true);
93 if (namefile == "not open") { abort = true; }
94 else if (namefile == "not found") { namefile = ""; }
96 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
97 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
99 if (columnfile != "") {
100 if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
103 //check for optional parameter and set defaults
104 // ...at some point should added some additional type checking...
105 //get user cutoff and precision or use defaults
107 temp = validParameter.validFile(parameters, "precision", false);
108 if (temp == "not found") { temp = "100"; }
109 //saves precision legnth for formatting below
110 length = temp.length();
111 convert(temp, precision);
113 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
116 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
117 convert(temp, processors);
119 temp = validParameter.validFile(parameters, "cutoff", false);
120 if (temp == "not found") { temp = "10"; }
121 convert(temp, cutoff);
122 if (!hard) { cutoff += (5 / (precision * 10.0)); }
124 method = validParameter.validFile(parameters, "method", false);
125 if (method == "not found") { method = "furthest"; }
127 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
128 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
130 showabund = validParameter.validFile(parameters, "showabund", false);
131 if (showabund == "not found") { showabund = "T"; }
133 timing = validParameter.validFile(parameters, "timing", false);
134 if (timing == "not found") { timing = "F"; }
138 catch(exception& e) {
139 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
144 //**********************************************************************************************************************
146 void ClusterSplitCommand::help(){
148 m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
149 m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n");
150 m->mothurOut("The cluster command should be in the following format: \n");
151 m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
152 m->mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
154 catch(exception& e) {
155 m->errorOut(e, "ClusterSplitCommand", "help");
160 //**********************************************************************************************************************
162 ClusterSplitCommand::~ClusterSplitCommand(){}
164 //**********************************************************************************************************************
166 int ClusterSplitCommand::execute(){
169 if (abort == true) { return 0; }
171 //****************** file prep work ******************************//
173 //if user gave a phylip file convert to column file
174 if (format == "phylip") {
176 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
178 NameAssignment* nameMap = NULL;
179 convert->setFormat("phylip");
180 convert->read(nameMap);
182 if (m->control_pressed) { delete convert; return 0; }
184 distfile = convert->getOutputFile();
186 //if no names file given with phylip file, create it
187 ListVector* listToMakeNameFile = convert->getListVector();
188 if (namefile == "") { //you need to make a namefile for split matrix
190 namefile = phylipfile + ".names";
191 openOutputFile(namefile, out);
192 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
193 string bin = listToMakeNameFile->get(i);
194 out << bin << '\t' << bin << endl;
198 delete listToMakeNameFile;
201 if (m->control_pressed) { return 0; }
203 time_t estart = time(NULL);
205 //split matrix into non-overlapping groups
206 SplitMatrix* split = new SplitMatrix(distfile, namefile, cutoff);
209 if (m->control_pressed) { delete split; return 0; }
211 string singletonName = split->getSingletonNames();
212 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
215 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
218 if (m->control_pressed) { return 0; }
220 //****************** break up files between processes and cluster each file set ******************************//
221 vector<string> listFileNames;
223 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
225 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
227 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
228 dividedNames.resize(processors);
230 //for each file group figure out which process will complete it
231 //want to divide the load intelligently so the big files are spread between processes
233 for (int i = 0; i < distName.size(); i++) {
234 int processToAssign = (i+1) % processors;
235 if (processToAssign == 0) { processToAssign = processors; }
237 dividedNames[(processToAssign-1)].push_back(distName[i]);
240 //not lets reverse the order of ever other process, so we balance big files running with little ones
241 for (int i = 0; i < processors; i++) {
242 int remainder = ((i+1) % processors);
243 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
246 createProcesses(dividedNames);
248 if (m->control_pressed) { return 0; }
250 //get list of list file names from each process
251 for(int i=0;i<processors;i++){
252 string filename = toString(processIDS[i]) + ".temp";
254 openInputFile(filename, in);
258 in >> tempName; gobble(in);
259 listFileNames.push_back(tempName);
262 remove((toString(processIDS[i]) + ".temp").c_str());
265 filename = toString(processIDS[i]) + ".temp.labels";
267 openInputFile(filename, in2);
271 in2 >> tempName; gobble(in);
272 if (labels.count(tempName) == 0) { labels.insert(tempName); }
275 remove((toString(processIDS[i]) + ".temp.labels").c_str());
279 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
282 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
284 //****************** merge list file and create rabund and sabund files ******************************//
286 mergeLists(listFileNames, singletonName, labels);
288 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
290 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
292 m->mothurOutEndLine();
293 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
294 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
295 m->mothurOutEndLine();
299 catch(exception& e) {
300 m->errorOut(e, "ClusterSplitCommand", "execute");
304 //**********************************************************************************************************************
305 int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton, set<string> userLabels){
307 if (outputDir == "") { outputDir += hasPath(distfile); }
308 fileroot = outputDir + getRootName(getSimpleName(distfile));
310 openOutputFile(fileroot+ tag + ".sabund", outSabund);
311 openOutputFile(fileroot+ tag + ".rabund", outRabund);
312 openOutputFile(fileroot+ tag + ".list", outList);
314 outputNames.push_back(fileroot+ tag + ".sabund");
315 outputNames.push_back(fileroot+ tag + ".rabund");
316 outputNames.push_back(fileroot+ tag + ".list");
319 ListVector* listSingle = NULL;
320 if (singleton != "none") {
322 openInputFile(singleton, in);
324 string firstCol, secondCol;
325 listSingle = new ListVector();
327 in >> firstCol >> secondCol; gobble(in);
328 listSingle->push_back(secondCol);
333 vector<float> orderFloat;
335 //go through users set and make them floats so we can sort them
336 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
339 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)){
341 orderFloat.push_back(temp);
342 }else if (*it == "unique") { orderFloat.push_back(-1.0); }
344 userLabels.erase(*it);
350 sort(orderFloat.begin(), orderFloat.end());
352 vector<InputData*> inputs;
353 vector<string> lastLabels;
354 for (int i = 0; i < listNames.size(); i++) {
355 InputData* input = new InputData(listNames[i], "list");
356 inputs.push_back(input);
359 openInputFile(listNames[i], in);
360 ListVector tempList(in);
361 lastLabels.push_back(tempList.getLabel());
365 ListVector* merged = NULL;
367 //for each label needed
368 for(int l = 0; l < orderFloat.size(); l++){
371 if (orderFloat[l] == -1) { thisLabel = "unique"; }
372 else { thisLabel = toString(orderFloat[l], length-1); }
374 //get the list info from each file
375 for (int k = 0; k < listNames.size(); k++) {
377 if (m->control_pressed) {
378 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
379 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
380 delete merged; merged = NULL;
384 ListVector* list = inputs[k]->getListVector();
386 //this file has reached the end
387 if (list == NULL) { list = inputs[k]->getListVector(lastLabels[k], true); }
390 if (list->getLabel() == "unique") { labelFloat = -1.0; }
391 else { convert(list->getLabel(), labelFloat); }
393 //check for missing labels
394 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
395 //if its bigger get last label, otherwise keep it
397 list = inputs[k]->getListVector(lastLabels[k], true); //get last list vector to use, you actually want to move back in the file
399 lastLabels[k] = list->getLabel();
401 //is this the first file
402 if (merged == NULL) { merged = new ListVector(); merged->setLabel(thisLabel); }
404 for (int j = 0; j < list->getNumBins(); j++) {
405 merged->push_back(list->get(j));
412 for (int j = 0; j < listSingle->getNumBins(); j++) {
413 merged->push_back(listSingle->get(j));
419 delete merged; merged = NULL;
422 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
424 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
428 catch(exception& e) {
429 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
433 //**********************************************************************************************************************
435 void ClusterSplitCommand::printData(ListVector* oldList){
437 string label = oldList->getLabel();
438 RAbundVector oldRAbund = oldList->getRAbundVector();
440 oldRAbund.setLabel(label);
441 if (isTrue(showabund)) {
442 oldRAbund.getSAbundVector().print(cout);
444 oldRAbund.print(outRabund);
445 oldRAbund.getSAbundVector().print(outSabund);
447 oldList->print(outList);
449 catch(exception& e) {
450 m->errorOut(e, "ClusterSplitCommand", "printData");
454 //**********************************************************************************************************************
455 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
458 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
463 //loop through and create all the processes you want
464 while (process != processors) {
468 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
472 vector<string> listFileNames = cluster(dividedNames[process], labels);
474 //write out names to file
475 string filename = toString(getpid()) + ".temp";
477 openOutputFile(filename, out);
478 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
483 filename = toString(getpid()) + ".temp.labels";
484 openOutputFile(filename, outLabels);
486 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
487 outLabels << (*it) << endl;
492 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
495 //force parent to wait until all the processes are done
496 for (int i=0;i<processors;i++) {
497 int temp = processIDS[i];
505 catch(exception& e) {
506 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
510 //**********************************************************************************************************************
512 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
515 SparseMatrix* matrix;
518 RAbundVector* rabund;
520 vector<string> listFileNames;
522 //cluster each distance file
523 for (int i = 0; i < distNames.size(); i++) {
525 string thisNamefile = distNames[i].begin()->second;
526 string thisDistFile = distNames[i].begin()->first;
528 //read in distance file
529 globaldata->setNameFile(thisNamefile);
530 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
532 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
533 read->setCutoff(cutoff);
535 NameAssignment* nameMap = new NameAssignment(thisNamefile);
539 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
541 list = read->getListVector();
543 matrix = read->getMatrix();
548 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
550 rabund = new RAbundVector(list->getRAbundVector());
553 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
554 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
555 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
556 tag = cluster->getTag();
558 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
559 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
562 openOutputFile(fileroot+ tag + ".list", listFile);
564 listFileNames.push_back(fileroot+ tag + ".list");
566 time_t estart = time(NULL);
568 float previousDist = 0.00000;
569 float rndPreviousDist = 0.00000;
575 double saveCutoff = cutoff;
577 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
579 if (m->control_pressed) { //clean up
580 delete matrix; delete list; delete cluster; delete rabund;
582 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
583 listFileNames.clear(); return listFileNames;
586 cluster->update(cutoff);
588 float dist = matrix->getSmallDist();
589 float rndDist = roundDist(dist, precision);
591 if(previousDist <= 0.0000 && dist != previousDist){
592 oldList.setLabel("unique");
593 oldList.print(listFile);
594 if (labels.count("unique") == 0) { labels.insert("unique"); }
596 else if(rndDist != rndPreviousDist){
597 oldList.setLabel(toString(rndPreviousDist, length-1));
598 oldList.print(listFile);
599 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
603 rndPreviousDist = rndDist;
608 if(previousDist <= 0.0000){
609 oldList.setLabel("unique");
610 oldList.print(listFile);
611 if (labels.count("unique") == 0) { labels.insert("unique"); }
613 else if(rndPreviousDist<cutoff){
614 oldList.setLabel(toString(rndPreviousDist, length-1));
615 oldList.print(listFile);
616 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
619 delete matrix; delete list; delete cluster; delete rabund;
622 if (m->control_pressed) { //clean up
623 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
624 listFileNames.clear(); return listFileNames;
627 remove(thisDistFile.c_str());
628 remove(thisNamefile.c_str());
632 return listFileNames;
635 catch(exception& e) {
636 m->errorOut(e, "ClusterSplitCommand", "cluster");
643 //**********************************************************************************************************************